This post describes one way that you can read the top N rows from large text files with C#. This is very useful when working with giant files that are too big to open, but you need to view a portion of them to determine the schema, data types, etc.

I’ve used PowerShell many times to do this with large csv files, but in this example we’re going to use C# and look at the Wikipedia XML dump of pages and articles. The 2017-03-01 dump is very large and comes in at 59.5 GB.

Wikipedia XML dump of pages and articles at 59.5 GB

The script is short and simple. All it does is read one line at a time from the file and write it to the console until it reaches the desired row number. For running little scripts like this, I default to using LINQPad.

// Prints the first `rows` lines of the file at `filePath` to the console.
// Lines are read one at a time through a StreamReader, so the whole file
// is never loaded into memory.
static void Main()
{
    int rows = 1000;
    string filePath = @"H:\Data\enwiki-20170301-pages-articles.xml\enwiki-20170301-pages-articles.xml";

    using (StreamReader reader = new StreamReader(filePath))
    {
        string line;

        // ReadLine returns null once the end of the file is reached; stop
        // there so a file shorter than `rows` lines does not produce a run
        // of trailing blank lines. The counter is checked first so no extra
        // line is read after the limit has been hit.
        for (int counter = 0; counter < rows && (line = reader.ReadLine()) != null; counter++)
        {
            Console.WriteLine(line);
        }
    }
}

A portion of the output is shown below. We now have a link to the XML schema, and can browse the first few pages.

<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">
  <siteinfo>
    <sitename>Wikipedia</sitename>
    <dbname>enwiki</dbname>
    <base>https://en.wikipedia.org/wiki/Main_Page</base>
    <generator>MediaWiki 1.29.0-wmf.13</generator>
    <case>first-letter</case>
    <namespaces>
      <namespace key="-2" case="first-letter">Media</namespace>
      <namespace key="-1" case="first-letter">Special</namespace>
      <namespace key="0" case="first-letter" />
      <namespace key="1" case="first-letter">Talk</namespace>
      <namespace key="2" case="first-letter">User</namespace>
      <namespace key="3" case="first-letter">User talk</namespace>
      <namespace key="4" case="first-letter">Wikipedia</namespace>
      <namespace key="5" case="first-letter">Wikipedia talk</namespace>
      <namespace key="6" case="first-letter">File</namespace>
      <namespace key="7" case="first-letter">File talk</namespace>
      <namespace key="8" case="first-letter">MediaWiki</namespace>
      <namespace key="9" case="first-letter">MediaWiki talk</namespace>
      <namespace key="10" case="first-letter">Template</namespace>
      <namespace key="11" case="first-letter">Template talk</namespace>
      <namespace key="12" case="first-letter">Help</namespace>
      <namespace key="13" case="first-letter">Help talk</namespace>
      <namespace key="14" case="first-letter">Category</namespace>
      <namespace key="15" case="first-letter">Category talk</namespace>
      <namespace key="100" case="first-letter">Portal</namespace>
      <namespace key="101" case="first-letter">Portal talk</namespace>
      <namespace key="108" case="first-letter">Book</namespace>
      <namespace key="109" case="first-letter">Book talk</namespace>
      <namespace key="118" case="first-letter">Draft</namespace>
      <namespace key="119" case="first-letter">Draft talk</namespace>
      <namespace key="446" case="first-letter">Education Program</namespace>
      <namespace key="447" case="first-letter">Education Program talk</namespace>
      <namespace key="710" case="first-letter">TimedText</namespace>
      <namespace key="711" case="first-letter">TimedText talk</namespace>
      <namespace key="828" case="first-letter">Module</namespace>
      <namespace key="829" case="first-letter">Module talk</namespace>
      <namespace key="2300" case="first-letter">Gadget</namespace>
      <namespace key="2301" case="first-letter">Gadget talk</namespace>
      <namespace key="2302" case="case-sensitive">Gadget definition</namespace>
      <namespace key="2303" case="case-sensitive">Gadget definition talk</namespace>
    </namespaces>
  </siteinfo>
  <page>
    <title>AccessibleComputing</title>
    <ns>0</ns>
    <id>10</id>
    <redirect title="Computer accessibility" />
    <revision>
      <id>767284433</id>
      <parentid>631144794</parentid>
      <timestamp>2017-02-25T00:30:28Z</timestamp>
      <contributor>
        <username>Godsy</username>
        <id>23257138</id>
      </contributor>
      <comment>[[Template:This is a redirect]] has been deprecated, change to [[Template:Redirect category shell]].</comment>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">#REDIRECT [[Computer accessibility]]

{{Redirect category shell|
{{R from move}}
{{R from CamelCase}}
{{R unprintworthy}}
}}</text>
      <sha1>ds1crfrjsn7xv73djcs4e4aq9niwanx</sha1>
    </revision>
  </page>
  <page>
    <title>Anarchism</title>
    <ns>0</ns>
    <id>12</id>
    <revision>
      <id>767587655</id>
      <parentid>767587451</parentid>
      <timestamp>2017-02-26T19:53:52Z</timestamp>
      <contributor>
        <username>MShabazz</username>
        <id>11041638</id>
      </contributor>
      <minor />
      <comment>/* Spanish Revolution */  fixing typo, adding comma</comment>