User:Timothee Flutre/Notebook/Postdoc/2012/09/12

From OpenWetWare

< User:Timothee Flutre | Notebook | Postdoc | 2012 | 09(Difference between revisions)
Jump to: navigation, search
(Entry title: first version (without code))
Current revision (23:31, 22 July 2013) (view source)
(Handling compressed files with gzip in C++: add link to AFLIB by Niels Homer)
 
(2 intermediate revisions not shown.)
Line 12: Line 12:
* The [http://www.cs.unc.edu/Research/compgeom/gzstream/ gzstream] library is often mentioned as a good solution. However, it's likely to be already installed neither on your machine, nor the one of people interested in your code. So you and them will have to install it (or you'll have to distribute it with your own package). Moreover, it doesn't support seek and is unlikely to do so in the near future. Also, if you want your code to successfully read files whether they are compressed or not, you will have to check the extension of the file name by yourself (.gz) and use ifstream if uncompressed and igzstream otherwise: messy...
* The [http://www.cs.unc.edu/Research/compgeom/gzstream/ gzstream] library is often mentioned as a good solution. However, it's likely to be already installed neither on your machine, nor the one of people interested in your code. So you and them will have to install it (or you'll have to distribute it with your own package). Moreover, it doesn't support seek and is unlikely to do so in the near future. Also, if you want your code to successfully read files whether they are compressed or not, you will have to check the extension of the file name by yourself (.gz) and use ifstream if uncompressed and igzstream otherwise: messy...
-
* Why not using [http://www.zlib.net/ zlib] directly? If you work on Linux it's already installed (it's used by the Linux kernel), and if you work on Mac OS it's likely to be already there also. (And it even works on Windows, but who cares?) More importantly, if you're not a professional software developer, it happens to be pretty easy to use zlib. Below is an example code showing how I typically use it in my own C++ code..
+
* Why not using [http://www.zlib.net/ zlib] directly? If you work on Linux it's already installed (it's used by the Linux kernel), and if you work on Mac OS it's likely to be already there also. (And it even works on Windows, but who cares?) More importantly, if you're not a professional software developer, it happens to be pretty easy to use zlib. Below is an example code showing how I typically use it in my own C++ code.
 +
''Disclaimer: I am not a professional programmer, so I'm sure it's possible to do better (please, provide feedback!), but at least it's working smoothly for me and it should help other people as well.''
 +
 +
<nowiki>
 +
// Author: Timothee Flutre
 +
// Aim: show how to use Zlib in your own C++ code
 +
// Compilation: g++ -Wall -Wextra -g test.cpp -lz
 +
// Not copyrighted -- provided to the public domain
 +
 +
#include <cmath>
 +
#include <ctime>
 +
#include <getopt.h>
 +
#include <cerrno>
 +
 +
#include <iostream>
 +
#include <string>
 +
#include <sstream>
 +
using namespace std;
 +
 +
#include "zlib.h"
 +
 +
//-----------------------------------------------------------------------------
 +
// copy-paste and use the four following functions in your own code
 +
 +
void
 +
openFile (
 +
  const string & pathToFile,
 +
  gzFile & fileStream,
 +
  const char * mode)
 +
{
 +
  fileStream = gzopen (pathToFile.c_str(), mode);
 +
  if (fileStream == NULL)
 +
  {
 +
    cerr << "ERROR: can't open file " << pathToFile
 +
        << " with mode " << *mode
 +
        << " (errno=" << errno << ")" << endl;
 +
    exit (1);
 +
  }
 +
}
 +
 +
void
 +
closeFile (
 +
  const string & pathToFile,
 +
  gzFile & fileStream)
 +
{
 +
  int ret = gzclose (fileStream);
 +
  if (ret != Z_OK)
 +
  {
 +
    cerr << "ERROR: can't close the file " << pathToFile
 +
        << ", gzclose() returned " << ret << endl;
 +
    exit (1);
 +
  }
 +
}
 +
 +
int
 +
getline (
 +
  gzFile & fileStream,
 +
  string & line)
 +
{
 +
  int res = 1, c;
 +
  line.clear ();
 +
  while (true)
 +
  {
 +
    c = gzgetc (fileStream);
 +
    if (c == -1) // eof or error
 +
    {
 +
      res = 0;
 +
      break;
 +
    }
 +
    else if (c == 10) // 10 is ASCII code for '\n'
 +
      break;
 +
    else
 +
      line.push_back (c);
 +
  }
 +
  return res;
 +
}
 +
 +
void
 +
gzwriteLine (
 +
  gzFile & fileStream,
 +
  const string & line,
 +
  const string & pathToFile,
 +
  const size_t & lineId)
 +
{
 +
  // if (gzprintf (fileStream, "%s", line.c_str()) <= 0)
 +
  if (gzputs (fileStream, line.c_str()) < 0)
 +
  {
 +
    cerr << "ERROR: can't write line " << lineId
 +
        << " in file " << pathToFile << endl;
 +
    exit (1);
 +
  }
 +
}
 +
 +
//-----------------------------------------------------------------------------
 +
// functions to show how it works
 +
 +
void
 +
readAGzipFileWithZlib ()
 +
{
 +
  cout << __FUNCTION__ << endl;
 +
 +
  // create a dummy file via a system call
 +
  system ("rm -f data.txt.gz; echo -e \"aaa\nbb\nccc\" | gzip > data.txt.gz");
 +
 +
  // read the dummy file using zlib API
 +
  string fileName = "data.txt.gz";
 +
  gzFile fileStream;
 +
  openFile (fileName, fileStream, "rb"); // works also if file not compressed
 +
  string line;
 +
  size_t lineId = 0;
 +
  while (getline (fileStream, line))
 +
  {
 +
    ++lineId;
 +
    cout << "line " << lineId << ": " << line << endl;
 +
  }
 +
  if (! gzeof (fileStream))
 +
  {
 +
    cerr << "ERROR: can't read successfully file "
 +
        << fileName << " up to the end" << endl;
 +
    exit (1);
 +
  }
 +
  closeFile (fileName, fileStream);
 +
 +
  //clean
 +
  system ("rm -f data.txt.gz");
 +
}
 +
 +
void writeAGzipFileWithZlib ()
 +
{
 +
  cout << __FUNCTION__ << endl;
 +
 +
  // write a dummy file using zlib API
 +
  string fileName = "data.txt.gz";
 +
  gzFile fileStream;
 +
  openFile (fileName, fileStream, "wb");
 +
  stringstream ssLine;
 +
  ssLine << "1";
 +
  for (size_t i = 1; i < 10; ++i)
 +
    ssLine << " " << (i+1);
 +
  ssLine << endl;
 +
  gzwriteLine (fileStream, ssLine.str(), fileName, 1);
 +
  closeFile (fileName, fileStream);
 +
 +
  // read it via a system call
 +
  system ("zcat data.txt.gz");
 +
 +
  // clean
 +
  system ("rm -f data.txt.gz");
 +
}
 +
 +
int main (void)
 +
{
 +
  readAGzipFileWithZlib ();
 +
 +
  writeAGzipFileWithZlib ();
 +
 +
  return EXIT_SUCCESS;
 +
}
 +
</nowiki>
 +
 +
* As usual, I'm not the only one who came up with something like this, have a look at [http://nilshomer.com/index.php?title=AFLIB AFLIB] by Niels Homer, a minimal C library integrating stdio, ZLIB and LIBBZIP2.
<!-- ##### DO NOT edit below this line unless you know what you are doing. ##### -->
<!-- ##### DO NOT edit below this line unless you know what you are doing. ##### -->

Current revision

Project name Main project page
Previous entry      Next entry

Handling compressed files with gzip in C++

  • It's more and more common in biology to handle large amounts of data, and thus more and more necessary to work with compressed files. The gzip program offers a good balance between compression speed and size. That's why high-level languages such as Python and R natively provide ways to handle files compressed with gzip. But what about C++?
  • The gzstream library is often mentioned as a good solution. However, it's unlikely to be already installed, either on your machine or on the machines of people interested in your code. So you and they will have to install it (or you'll have to distribute it with your own package). Moreover, it doesn't support seek and is unlikely to do so in the near future. Also, if you want your code to successfully read files whether they are compressed or not, you will have to check the extension of the file name yourself (.gz) and use ifstream if uncompressed and igzstream otherwise: messy...
  • Why not use zlib directly? If you work on Linux it's already installed (it's used by the Linux kernel), and if you work on Mac OS it's likely to be there already as well. (And it even works on Windows, but who cares?) More importantly, if you're not a professional software developer, it happens to be pretty easy to use zlib. Below is some example code showing how I typically use it in my own C++ code.

Disclaimer: I am not a professional programmer, so I'm sure it's possible to do better (please, provide feedback!), but at least it's working smoothly for me and it should help other people as well.

// Author: Timothee Flutre
// Aim: show how to use Zlib in your own C++ code
// Compilation: g++ -Wall -Wextra -g test.cpp -lz
// Not copyrighted -- provided to the public domain

#include <getopt.h>

#include <cerrno>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <ctime>

#include <iostream>
#include <sstream>
#include <string>

#include "zlib.h"

using namespace std;

//-----------------------------------------------------------------------------
// copy-paste and use the four following functions in your own code

// Open a file through zlib and hand the resulting handle back to the caller.
//
// pathToFile: path of the file to open
// fileStream: output; receives the handle returned by gzopen()
// mode:       gzopen() mode string, passed through unchanged (e.g. "rb", "wb")
//
// If gzopen() returns NULL, an error message is printed on stderr and the
// program is terminated with exit status 1; hence, when this function
// returns, fileStream is not NULL.
void
openFile (
  const string & pathToFile,
  gzFile & fileStream,
  const char * mode)
{
  // reset errno so that a stale value left by an earlier call is not reported
  errno = 0;
  fileStream = gzopen (pathToFile.c_str(), mode);
  if (fileStream == NULL)
  {
    // save errno right away, before the stream insertions below can modify it
    const int savedErrno = errno;
    cerr << "ERROR: can't open file " << pathToFile
         << " with mode " << mode // the whole mode string, not only its first character
         << " (errno=" << savedErrno << ": " << strerror (savedErrno) << ")"
         << endl;
    exit (1);
  }
}

// Close a zlib handle with gzclose().
//
// pathToFile: path of the file, used only in the error message
// fileStream: handle to close
//
// If gzclose() returns anything other than Z_OK, the returned code is
// reported on stderr and the program is terminated with exit status 1.
void
closeFile (
  const string & pathToFile,
  gzFile & fileStream)
{
  const int status = gzclose (fileStream);
  if (status == Z_OK)
    return; // nothing to report

  cerr << "ERROR: can't close the file " << pathToFile
       << ", gzclose() returned " << status << endl;
  exit (1);
}

// Read the next line from a zlib handle, one character at a time.
//
// fileStream: handle to read from
// line:       output; cleared first, then filled with the characters read,
//             without the terminating '\n'
//
// Returns 1 if a line was read, i.e. a '\n' was found or at least one
// character was collected before gzgetc() returned -1; returns 0 if
// gzgetc() returned -1 before any character was collected.
//
// Returning 1 for a non-empty unterminated line means that the last line of
// a file lacking a trailing '\n' is not lost by a "while (getline (...))"
// loop; the following call then returns 0. As gzgetc() returns -1 both at
// end of file and on error, the caller should check gzeof() once the loop
// is over to tell the two apart.
int
getline (
  gzFile & fileStream,
  string & line)
{
  line.clear ();
  while (true)
  {
    const int c = gzgetc (fileStream);
    if (c == '\n') // end of the current line
      return 1;
    if (c == -1) // eof or error
      return line.empty () ? 0 : 1;
    line.push_back (static_cast<char> (c));
  }
}

// Write a string to a zlib handle with gzputs().
//
// fileStream: handle to write to
// line:       text to write; it is written as is (no '\n' is appended)
// pathToFile: path of the file, used only in the error message
// lineId:     identifier of the line, used only in the error message
//
// If gzputs() returns a negative value, an error message is printed on
// stderr and the program is terminated with exit status 1.
void
gzwriteLine (
  gzFile & fileStream,
  const string & line,
  const string & pathToFile,
  const size_t & lineId)
{
  // gzprintf (fileStream, "%s", line.c_str()) would be an alternative
  const int nbWritten = gzputs (fileStream, line.c_str());
  if (nbWritten >= 0)
    return; // success

  cerr << "ERROR: can't write line " << lineId
       << " in file " << pathToFile << endl;
  exit (1);
}

//-----------------------------------------------------------------------------
// functions to show how it works

// Demo: create a small gzip-compressed file with shell commands, read it
// back line by line via openFile()/getline()/closeFile(), print each line
// with its number on stdout, then remove the file.
//
// Terminates the program with exit status 1 if the file can't be created
// or can't be read up to its end.
void
readAGzipFileWithZlib ()
{
  cout << __FUNCTION__ << endl;

  const string fileName = "data.txt.gz";

  // create a dummy file via a system call
  // (printf is used rather than "echo -e": system() hands the command to
  // sh, and not every sh implements the -e option of echo, in which case
  // "-e" would end up in the first line of the file)
  if (system ("rm -f data.txt.gz; printf '%s\\n' aaa bb ccc | gzip > data.txt.gz") != 0)
  {
    cerr << "ERROR: can't create file " << fileName << endl;
    exit (1);
  }

  // read the dummy file using zlib API
  gzFile fileStream = NULL;
  openFile (fileName, fileStream, "rb"); // works also if file not compressed
  string line;
  size_t lineId = 0;
  while (getline (fileStream, line))
  {
    ++lineId;
    cout << "line " << lineId << ": " << line << endl;
  }
  // getline() stops at end of file as well as on error: tell them apart
  if (! gzeof (fileStream))
  {
    cerr << "ERROR: can't read successfully file "
         << fileName << " up to the end" << endl;
    exit (1);
  }
  closeFile (fileName, fileStream);

  // clean (a failure here is reported but is not fatal)
  if (system ("rm -f data.txt.gz") != 0)
    cerr << "WARNING: can't remove file " << fileName << endl;
}

// Demo: write one line ("1 2 ... 10") into a gzip-compressed file via
// openFile()/gzwriteLine()/closeFile(), display the file content with a
// shell command, then remove the file.
void writeAGzipFileWithZlib ()
{
  cout << __FUNCTION__ << endl;

  // write a dummy file using zlib API
  const string fileName = "data.txt.gz";
  gzFile fileStream = NULL;
  openFile (fileName, fileStream, "wb");
  stringstream ssLine;
  ssLine << "1";
  for (size_t i = 2; i <= 10; ++i)
    ssLine << " " << i;
  ssLine << '\n'; // no need to flush a stringstream, hence no endl
  gzwriteLine (fileStream, ssLine.str(), fileName, 1);
  closeFile (fileName, fileStream);

  // read it via a system call
  // ("gzip -dc" rather than "zcat": some zcat implementations only look for
  // a file with the .Z suffix)
  if (system ("gzip -dc data.txt.gz") != 0)
    cerr << "WARNING: can't display file " << fileName << endl;

  // clean (a failure here is reported but is not fatal)
  if (system ("rm -f data.txt.gz") != 0)
    cerr << "WARNING: can't remove file " << fileName << endl;
}

// Entry point: run the reading demo, then the writing demo.
int main ()
{
  readAGzipFileWithZlib ();
  writeAGzipFileWithZlib ();
  return EXIT_SUCCESS;
}

  • As usual, I'm not the only one who came up with something like this: have a look at AFLIB by Nils Homer, a minimal C library integrating stdio, ZLIB and LIBBZIP2.


Personal tools