<?xml version='1.0' encoding='utf-8'?>
<?xml-stylesheet type="text/xsl" href="/v2/static/oai2.xsl"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
  <responseDate>2026-05-16T19:27:04Z</responseDate>
  <request identifier="oai:figshare.com:article/28381610" metadataPrefix="oai_datacite" verb="GetRecord">https://api.figshare.com/v2/oai</request>
  <GetRecord>
    <record>
      <header>
        <identifier>oai:figshare.com:article/28381610</identifier>
        <datestamp>2025-02-11T11:19:37Z</datestamp>
        <setSpec>category_28849</setSpec>
        <setSpec>category_29134</setSpec>
        <setSpec>category_29137</setSpec>
        <setSpec>category_27328</setSpec>
        <setSpec>category_29263</setSpec>
        <setSpec>category_27322</setSpec>
        <setSpec>portal_549</setSpec>
        <setSpec>item_type_3</setSpec>
        <setSpec>month_year_02_2025</setSpec>
      </header>
      <metadata>
        <resource xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.3/metadata.xsd">
          <identifier identifierType="DOI">10.5522/04/28381610.v1</identifier>
          <alternateIdentifiers>
            <alternateIdentifier alternateIdentifierType="URL">https://figshare.com/articles/dataset/NCSE_v2_0_A_Dataset_of_OCR-Processed_19th_Century_English_Newspapers/28381610</alternateIdentifier>
          </alternateIdentifiers>
          <relatedIdentifiers>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/52260821</relatedIdentifier>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/52260746</relatedIdentifier>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/52256369</relatedIdentifier>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/52256378</relatedIdentifier>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/52260608</relatedIdentifier>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/52260611</relatedIdentifier>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/52260923</relatedIdentifier>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/52260941</relatedIdentifier>
          </relatedIdentifiers>
          <creators>
            <creator>
              <creatorName>Bourne, Jonno</creatorName>
              <givenName>Jonno</givenName>
              <familyName>Bourne</familyName>
              <nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org">0000-0003-2616-3716</nameIdentifier>
            </creator>
          </creators>
          <titles>
            <title><![CDATA[NCSE v2.0: A Dataset of OCR-Processed 19th Century English Newspapers]]></title>
          </titles>
          <subjects>
            <subject>Natural language processing</subject>
            <subject>Library studies</subject>
            <subject>Open access</subject>
            <subject>Digital history</subject>
            <subject>Media studies</subject>
            <subject>British history</subject>
            <subject>newspapers</subject>
            <subject>OCR</subject>
            <subject>NLP</subject>
          </subjects>
          <dates>
            <date dateType="Created">2025-02-11</date>
            <date dateType="Updated">2025-02-11</date>
          </dates>
          <resourceType resourceTypeGeneral="Dataset">Dataset</resourceType>
          <publicationYear>2025</publicationYear>
          <publisher>University College London</publisher>
          <rightsList>
            <rights rightsURI="https://creativecommons.org/licenses/by/4.0/" rightsIdentifier="CC BY 4.0"/>
            <rights rightsURI="http://purl.org/coar/access_right/c_abf2" rightsIdentifier="open access"/>
          </rightsList>
          <descriptions>
            <description descriptionType="Abstract"><![CDATA[NCSE v2.0 Dataset Repository<p dir="ltr">This repository contains the NCSE v2.0 dataset and associated supporting data used in the paper "Reading the unreadable: Creating a dataset of 19th century English newspapers using image-to-text language models".</p><h2>Dataset Overview</h2><p dir="ltr">The NCSE v2.0 is a digitized collection of six 19th-century English periodicals containing:</p><ul><li>82,690 pages</li><li>1.4 million entries</li><li>321 million words</li><li>1.9 billion characters</li></ul><p dir="ltr">The dataset includes:</p><ul><li>1.1 million text entries</li><li>198,000 titles</li><li>17,000 figure descriptions</li><li>16,000 tables</li></ul><h2>Repository Contents</h2><ol><li><b>NCSE v2.0 Dataset</b></li><li><ul><li>NCSE_v2.zip: a folder containing a parquet file for each of the periodicals as well as a readme file.</li></ul></li><li><b>Bounding Box Dataset</b><br>A zip file called bounding_box.zip. Contains</li><li><ul><li>post_process: A folder of the processed periodical bounding box data</li><li>post_process_fill: A folder of the processed periodical bounding box data WITH column filling.</li><li>bbox_readme.txt: a readme file and data description for the bounding boxes</li></ul></li><li><b>Test Sets</b></li><li><ul><li>cropped_images.zip: 378 images cropped from the NCSE test set pages, all 2-bit png files</li><li>ground_truth: 358 text files corresponding to the text from the cropped_images folder</li></ul></li><li><b>Classification Training Data</b><br>The below files are used for training the classification models. They contain 12000 observations 2000 from each periodical. The labels were classified using mistral-large-2411. This data is used to train the ModernBERT classifier described in the paper. 
The topics are taken from the International Press Telecommunications Council (IPTC) subject codes.</li><li><ul><li>silver_IPTC_class.parquet: IPTC topic classification silver set</li><li>silver_text_type.parquet: Text-type classification silver set</li></ul></li><li><b>Classified Data</b><br>The zip file "classification_data.zip" with all rows classified using the ModernBERT classifier described in the paper.</li><li><ul><li>IPTC_type_classified.zip: contains one parquet file per periodical</li><li>text_type_classified.zip: contains one parquet file per periodical</li><li>classification_readme.md: Description of the data</li></ul></li><li><b>Classification Mappings</b><br>Data for mapping the classification codes to human readable names.</li><li><ul><li>class_mappings.zip: contains a json for each classification type</li><li><ul><li>IPTC_class_mapping.json</li><li>text_type_class_mapping.json</li></ul></li></ul></li></ol><h2>Original Images</h2><p dir="ltr">The original page images can be found at the King's College London Repositories:</p><ul><li><a href="https://doi.org/10.18742/25108519" target="_blank">Monthly Repository</a></li><li><a href="https://doi.org/10.18742/25106895" target="_blank">Northern Star</a></li><li><a href="https://doi.org/10.18742/25103786" target="_blank">Leader</a></li><li><a href="https://doi.org/10.18742/25109275" target="_blank">English Woman's Journal</a></li><li><a href="https://doi.org/10.18742/25109047" target="_blank">Tomahawk</a></li><li><a href="https://doi.org/10.18742/25101957" target="_blank">Publishers' Circular</a></li></ul><p dir="ltr">Or via the project central <a href="https://kclpure.kcl.ac.uk/portal/en/projects/nineteenth-century-serials-edition-investigating-an-archival-futu/datasets/" target="_blank">archive</a></p><h2>Citation</h2><p dir="ltr">If you use this dataset, please cite:</p><pre><pre>No citation data currently available<br></pre></pre><h2>Related Code</h2><p dir="ltr">All original code related to this 
project including the creation of the datasets and their analysis can be found at:<br><a href="https://github.com/JonnoB/ereading_the_unreadable" target="_blank">https://github.com/JonnoB/ereading_the_unreadable</a></p><h2>Contact</h2><p dir="ltr">For questions about the dataset, please <a href="https://github.com/JonnoB/ereading_the_unreadable" target="_blank">create an issue</a> in this repository.</p><h2>Usage Rights</h2><p dir="ltr">In keeping with the original NCSE dataset, all data is made available under a Creative Commons Attribution 4.0 International License (CC BY).</p>]]></description>
          </descriptions>
        </resource>
      </metadata>
    </record>
  </GetRecord>
</OAI-PMH>
