<?xml version='1.0' encoding='utf-8'?>
<?xml-stylesheet type="text/xsl" href="/v2/static/oai2.xsl"?>
<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
  <responseDate>2026-05-07T16:08:38Z</responseDate>
  <request identifier="oai:figshare.com:article/25805008" metadataPrefix="oai_datacite" verb="GetRecord">https://api.figshare.com/v2/oai</request>
  <GetRecord>
    <record>
      <header>
        <identifier>oai:figshare.com:article/25805008</identifier>
        <datestamp>2025-01-02T09:33:48Z</datestamp>
        <setSpec>category_29134</setSpec>
        <setSpec>category_29137</setSpec>
        <setSpec>category_29263</setSpec>
        <setSpec>category_27328</setSpec>
        <setSpec>category_27322</setSpec>
        <setSpec>category_28849</setSpec>
        <setSpec>portal_549</setSpec>
        <setSpec>item_type_3</setSpec>
        <setSpec>month_year_01_2025</setSpec>
      </header>
      <metadata>
        <resource xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.3/metadata.xsd">
          <identifier identifierType="DOI">10.5522/04/25805008.v1</identifier>
          <alternateIdentifiers>
            <alternateIdentifier alternateIdentifierType="URL">https://figshare.com/articles/dataset/Transcribed_newspaper_articles_from_the_NCSE_collection/25805008</alternateIdentifier>
          </alternateIdentifiers>
          <relatedIdentifiers>
            <relatedIdentifier relatedIdentifierType="URL" relationType="HasPart">https://ndownloader.figshare.com/files/46281040</relatedIdentifier>
          </relatedIdentifiers>
          <creators>
            <creator>
              <creatorName>Bourne, Jonno</creatorName>
              <givenName>Jonno</givenName>
              <familyName>Bourne</familyName>
              <nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org">0000-0003-2616-3716</nameIdentifier>
            </creator>
          </creators>
          <titles>
            <title><![CDATA[Transcribed newspaper articles from the NCSE collection]]></title>
          </titles>
          <subjects>
            <subject>Library studies</subject>
            <subject>Open access</subject>
            <subject>Media studies</subject>
            <subject>Digital history</subject>
            <subject>British history</subject>
            <subject>Natural language processing</subject>
            <subject>newspapers</subject>
            <subject>archives</subject>
            <subject>NLP</subject>
            <subject>OCR</subject>
          </subjects>
          <dates>
            <date dateType="Created">2025-01-02</date>
            <date dateType="Updated">2025-01-02</date>
          </dates>
          <resourceType resourceTypeGeneral="Dataset">Dataset</resourceType>
          <publicationYear>2025</publicationYear>
          <publisher>University College London</publisher>
          <rightsList>
            <rights rightsURI="https://creativecommons.org/publicdomain/zero/1.0/" rightsIdentifier="CC0"/>
            <rights rightsURI="http://purl.org/coar/access_right/c_abf2" rightsIdentifier="open access"/>
          </rightsList>
          <descriptions>
            <description descriptionType="Abstract"><![CDATA[<h2>CLOCR-C: Transcribed newspaper articles from the NCSE collection</h2><p dir="ltr">This dataset contains 91 pairs of newspaper articles from the Nineteenth Century Serials Edition (NCSE). Each pair consists of the original OCR from the NCSE and the transcribed equivalent. The data was used in "CLOCR-C: Context Leveraging OCR Correction with Pre-trained Language Models" to demonstrate that pre-trained language models are able to perform post-OCR correction to improve the accuracy of corrupted OCR text. The paper can be found on arXiv at https://arxiv.org/abs/2408.17428</p><h3>Data Details</h3><p dir="ltr">The data set comes from 6 different publications, and is made up of 91 articles, containing a total of 40712 words distributed across the 19th Century.</p><p dir="ltr">The dataset is a zip file made up of two sub-folders, each containing 91 files. Each file shares its name with a corresponding file in the other folder.</p><ul><li>transcription_files: contains .txt files of the transcribed articles</li><li>transcription_raw_ocr: contains .txt files of the original OCR</li></ul><p><br></p><p><br></p>]]></description>
          </descriptions>
        </resource>
      </metadata>
    </record>
  </GetRecord>
</OAI-PMH>
