Web Dev Ramblings

C# XSLX file reader without any dependencies

Published Thursday, August 18, 2016

The need to load data from an XSLX file for importing into your application is a common one and there are many solutions out there.

However, not wanting to use a large library I Googled around and it seems that it is actually relatively simple to do it yourself, below you'll find a small implementation that has been able to read every XSLX file I've come across so far:

using System; using System.Collections.Generic; using System.IO; using System.IO.Compression; using System.Xml; public class XSLXReader { public const String Letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; public List<Sheet> Sheets = new List<Sheet>(); public List<String> SharedStrings = new List<String>(); public static String NumberToLetters(Int32 number) { Int32 dividend = number + 1; String columnName = String.Empty; Int32 modulo; while (dividend > 0) { modulo = (dividend - 1) % 26; columnName = Convert.ToChar(65 + modulo).ToString() + columnName; dividend = (int)((dividend - modulo) / 26); } return columnName; } public static Int32 LettersToNumber(String s) { Int32 number = 0; Char[] cs = s.ToCharArray(); foreach (Char c in cs) { Int32 ix = Letters.IndexOf(c); if (ix == -1) { break; } number *= 26; number += ix + 1; } return Math.Max(0, number - 1); } public class Cell { public String Id; // A1, F5... public String Value; public String Type; } public class Row { public Int32 Id; // 4, 213 public List<Cell> Cells = new List<Cell>(); } public class Sheet { public List<Row> Rows = new List<Row>(); public Dictionary<String, Cell> Cells = new Dictionary<String, Cell>(); public Int32 NumberOfRows; // Number of rows in this document, in Spreadsheet terms, does not need to match Rows.Count public Int32 NumberOfColumns; // Number of columns in this Spreadsheets, based on Excel index public Cell GetCellByRowAndColumn(Int32 row, Int32 column) { String id = XSLXReader.NumberToLetters(column) + (row + 1); return GetCellById(id); } public Cell GetCellByColumnAndRow(Int32 column, Int32 row) { String id = XSLXReader.NumberToLetters(column) + (row + 1); return GetCellById(id); } public Cell GetCellById(String id) { Cell cell = null; if (Cells.TryGetValue(id, out cell)) { return cell; } return null; } public String GetCellValueByRowAndColumn(Int32 row, Int32 column, String defaultValue = "") { Cell cell = GetCellByRowAndColumn(row, column); if (cell == null) { return defaultValue; } return cell.Value; } public String GetCellValueByColumnAndRow(Int32 column, Int32 row, String defaultValue = "") { Cell cell = GetCellByColumnAndRow(column, row); if (cell == null) { return defaultValue; } return cell.Value; } public String GetCellValueById(String id, String defaultValue = "") { Cell cell = GetCellById(id); if (cell == null) { return defaultValue; } return cell.Value; } } public void LoadSheetData(XmlDocument dom) { XmlNodeList rows = dom.GetElementsByTagName("row"); if (rows.Count == 0) { return; } Sheet sheet = new Sheet(); sheet.NumberOfRows = Convert.ToInt32(rows[rows.Count - 1].Attributes["r"].Value); sheet.NumberOfColumns = 0; foreach (XmlElement rw in rows) { Row row = new Row(); row.Id = Convert.ToInt32(rw.GetAttribute("r")); sheet.Rows.Add(row); foreach (XmlElement cn in rw.ChildNodes) { if (cn.Name == "c") { XmlNodeList values = cn.GetElementsByTagName("v"); Cell cell = new Cell(); cell.Id = cn.GetAttribute("r"); cell.Type = cn.GetAttribute("t"); if (values.Count > 0) { cell.Value = values[0].InnerText; if (cell.Type == "s") { Int32 ix = Convert.ToInt32(cell.Value); cell.Value = SharedStrings[ix]; } } sheet.Cells.Add(cell.Id, cell); row.Cells.Add(cell); } } if (row.Cells.Count > 0) { Int32 n = LettersToNumber(row.Cells[row.Cells.Count - 1].Id) + 1; if (n > sheet.NumberOfColumns) { sheet.NumberOfColumns = n; } } } Sheets.Add(sheet); } public void LoadSharedStrings(XmlDocument dom) { XmlNodeList sis = dom.GetElementsByTagName("si"); foreach (XmlNode si in sis) { foreach (XmlNode cn in si.ChildNodes) { if (cn.Name == "t") { SharedStrings.Add(cn.InnerText); } } } // in at least one file (older variant?) if (SharedStrings.Count == 0) { XmlNodeList sstItems = dom.GetElementsByTagName("sstItem"); foreach (XmlNode si in sstItems) { foreach (XmlNode cn in si.ChildNodes) { if (cn.Name == "t") { SharedStrings.Add(cn.InnerText); } } } } } public XmlDocument GetXmlDocumentFromZipArchiveEntry(ZipArchiveEntry entry) { Stream stream = entry.Open(); XmlDocument dom = new XmlDocument(); dom.Load(stream); stream.Close(); return dom; } public Boolean Load(String fileName) { using (Stream stream = System.IO.File.Open(fileName, FileMode.Open)) { ZipArchive zipArchive = new ZipArchive(stream); foreach (ZipArchiveEntry zipEntry in zipArchive.Entries) { if (zipEntry.FullName.StartsWith("xl/sharedStrings.xml")) { LoadSharedStrings(GetXmlDocumentFromZipArchiveEntry(zipEntry)); } } foreach (ZipArchiveEntry zipEntry in zipArchive.Entries) { if (zipEntry.FullName.StartsWith("xl/worksheets/sheet")) { LoadSheetData(GetXmlDocumentFromZipArchiveEntry(zipEntry)); } } stream.Close(); } return true; } }

Slightly less than 250 lines of code, and that is all you need to extra data from an XSLX file. Now, there is no doubt additional processing that may be required to work data into the desired formats, for one there is no explicit handling of converting from the "n" type to an actual number and so on, I'll leave that up to you if you need it.

To parse a file use these lines

XSLXReader reader = new XSLXReader(); reader.Load("SomeFile.xslx");

To dump its content on screen

foreach (XSLXReader.Sheet sheet in reader.Sheets) { foreach (XSLXReader.Row row in sheet.Rows) { foreach (XSLXReader.Cell cell in row.Cells) { Response.Write(cell.Id + " - " + cell.Type + " - " + cell.Value + "<br />"); } } }

Something to consider: the reader will not create cell objects, nor row objects for empty cells and rows. If you need to access the content as a more traditional column row table you can instead use:

for (Int32 row = 0; row < sheet.NumberOfRows; ++row) { for (Int32 column = 0; column < sheet.NumberOfColumns; ++column) { Response.Write("Row: " + row + ", Column: " + column + ", Value: " + sheet.GetCellValueByRowAndColumn(row, column) + "<br />"); } }

GetCellValueByRowAndColumn (or GetCellValueByColumnAndRow if you prefer) automatically handles the case where a cell for that row & column combination does not exist and will return an optionally specified value, by default an empty string.

Its also possible to access cells directly by the names used in Excel, for example:

XSLXReader.Cell cell = sheet.GetCellById("D7");

Feel free to use this code in any way you see fit. Its supplied as is, with no expressed warranties etc etc.