将纯真ip库转换成普通的文本文件

时间:2022-12-20 11:45:24
网络上的IP数据库以纯真版的最为流行,本文不研究格式只给个将其转换为文本格式的代码。

关于纯真IP数据库的格式,本文参考了Luma大虾的文章和LumaQQ的IPSeeker类的代码,用C语言编写。本来是想写个专门读纯真IP数据库的类的,所以本文的代码本来只是个测试代码,后来懒了就没继续做下去,所以代码看上去比较乱,嘿嘿。

//file:getlist.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <assert.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>

// Path of the QQWry database file that this tool converts.
#define IP_DBPATH "/usr/local/LumaQQ/QQWry.dat"
// Number of header bytes read at the start of the file: two 4-byte
// little-endian values that main uses as the index begin/end offsets.
#define HEAD_SIZE 8
// Base address of the read-only mapping of the database (set in main).
// Offsets handled by read_record are relative to this address.
char* record_mmap;

// printf format for a dotted quad; every octet is zero-padded to 3 digits.
#define IP_QUAD_FORMAT_STRING "%03u.%03u.%03u.%03u"
// Expands to the four printf arguments matching IP_QUAD_FORMAT_STRING.
// x points at 4 bytes holding an address least-significant byte first,
// so the bytes are emitted in reverse memory order. x is parenthesised
// so that expressions such as `p + offset` can be passed safely.
#define IP_QUAD_LE(x) \
  (unsigned)*((const unsigned char*)(x) + 3),\
  (unsigned)*((const unsigned char*)(x) + 2),\
  (unsigned)*((const unsigned char*)(x) + 1),\
  (unsigned)*((const unsigned char*)(x) + 0)


//Function: report a fatal error and terminate the program.
//  perror() writes msg together with the text for the current errno
//  to stderr, then the process exits with status 1.
//Argument: msg - prefix describing the operation that failed
//Return  : does not return
void oops(const char* msg)
{
  perror(msg);
  exit(1);
}

//Proto   : unsigned long int_of_4byte_LE(const char* data)
//Function: read 4 bytes from data and combine them into an unsigned
//  integer, treating data[0] as the least significant byte.
//Argument: data - pointer to at least 4 readable bytes in
//  little endian (LE) order
//Return  : the 32-bit value, always in the range 0..0xFFFFFFFF
unsigned long int_of_4byte_LE(const char* data)
{
  const unsigned char* p = (const unsigned char*)data;
  // Widen each byte before shifting: an unsigned char promotes to
  // (signed) int, so `p[3] << 24` would overflow int for bytes >= 0x80
  // and the negative result would be sign-extended into the return value.
  return (((unsigned long)p[0]) |
      ((unsigned long)p[1] <<  8) |
      ((unsigned long)p[2] << 16) |
      ((unsigned long)p[3] << 24)
      );
}

//Proto   : unsigned long int_of_3byte_LE(const char* data)
//Function: read 3 bytes from data and combine them into an unsigned
//  integer, treating data[0] as the least significant byte.
//Argument: data - pointer to at least 3 readable bytes in
//  little endian (LE) order
//Return  : the 24-bit value, in the range 0..0xFFFFFF
unsigned long int_of_3byte_LE(const char* data)
{
  const unsigned char* bytes = (const unsigned char*)data;
  unsigned long value = 0;
  int i;

  // Fold from the most significant byte (index 2) down to index 0.
  for(i = 2; i >= 0; i--){
    value = (value << 8) | bytes[i];
  }
  return value;
}

//Proto   : void read_record(off_t offset)
//Function: print one database record as a text line:
//    <ip> TAB <country> TAB <territory> NEWLINE
//  The record starts at record_mmap + offset with a 4-byte
//  little-endian IP address, followed by the location data. The
//  location data is either stored inline as NUL-terminated strings or
//  redirected through a marker byte (0x01 or 0x02) followed by a
//  3-byte little-endian offset relative to record_mmap.
//Argument: offset - position of the record inside the mapping
//Return  : nothing; the line is written to stdout
//NOTE(review): redirect offsets are followed without bounds checks and
//  an offset of 0 is not special-cased - confirm against the format.
void read_record(off_t offset)
{
  const char* p = record_mmap;
  const char* country=NULL;
  const char* territory=NULL;
 
  printf(IP_QUAD_FORMAT_STRING,IP_QUAD_LE(p+offset));
  p += offset + 4;  // skip the 4-byte IP, now at the location data
  do{
    if(p[0] == 0x01){
      // Marker 0x01: the whole location data lives elsewhere; jump
      // there and parse again from the top of the loop.
      p = record_mmap + int_of_3byte_LE(p + 1);
      continue;
    }
    if(p[0] == 0x02){
      // Marker 0x02: only the country string is redirected; the
      // territory field follows the marker and its 3-byte offset.
      country = record_mmap + int_of_3byte_LE(p + 1);
      p+=4;
      while(p[0] == 0x01 || p[0] == 0x02){
        p = record_mmap + int_of_3byte_LE(p + 1);
      }
      territory = p;
      break;
    }
    // No marker: the country string is stored inline and the territory
    // field starts right after its terminating NUL.
    country   = p;
    p = strchr(p,0) + 1;
    while(p[0] == 0x01 || p[0] == 0x02){
      p = record_mmap + int_of_3byte_LE(p + 1);
    }
    territory = p;
  }while(!country || !territory);
  // Use real escape sequences; "/t" and "/n" printed literal slashes.
  printf("\t%s\t%s\n",country,territory);
}

//Proto   : void iterate_index(int db)
//Function: read 7-byte index entries from db, starting at the current
//  file position and continuing until end of file, and print the
//  record each entry refers to via read_record().
//  Bytes 0-3 of an entry hold an IP address (not printed here) and
//  bytes 4-6 hold the 3-byte little-endian offset of the record.
//Argument: db - open file descriptor positioned at the first entry
//Return  : nothing; exits with status 1 on a read error or when the
//  file ends in the middle of an entry
void iterate_index(int db)
{
  char    record[7];
  ssize_t n;

  for(;;){
    n = read(db,record,sizeof record);
    if(n == 0)break;                      // clean end of file
    if(n < 0)oops("read index error!");
    // Checked explicitly instead of with assert() so the test is not
    // compiled out by NDEBUG and a partial entry is never parsed.
    if(n != (ssize_t)sizeof record){
      fprintf(stderr,"truncated index entry: got %ld of %lu bytes\n",
          (long)n,(unsigned long)sizeof record);
      exit(1);
    }
    read_record((off_t)int_of_3byte_LE(record+4));
  }
}

//Function: open the database at IP_DBPATH, read the 8-byte header
//  (two little-endian 32-bit offsets: index begin and index end), map
//  the first index_beg bytes of the file read-only, then walk the
//  index from index_beg to the end of the file, printing one text
//  line per entry to stdout.
//Argument: argc, argv - unused
//Return  : 0 on success; exits with status 1 on any error
int main(int argc, char** argv)
{
  char buf[HEAD_SIZE];

  unsigned int index_beg,index_end;
  int db;

  (void)argc;
  (void)argv;

  if( (db = open(IP_DBPATH,O_RDONLY)) == -1 )oops("opendb error!");
  if(read(db,buf,HEAD_SIZE) != HEAD_SIZE)oops("read data header error!");
  index_beg=int_of_4byte_LE(buf);
  index_end=int_of_4byte_LE(buf+4);

  // Validate the header at run time rather than with assert(): the
  // index cannot start inside the header, must not end before it
  // begins, and must span a whole number of 7-byte entries.
  if(index_beg < HEAD_SIZE || index_end < index_beg ||
     (index_end - index_beg)%7 != 0){
    fprintf(stderr,"invalid index range in %s: %#x..%#x\n",
        IP_DBPATH,index_beg,index_end);
    exit(1);
  }

  // Map everything that precedes the index; record offsets are
  // relative to the start of the file.
  record_mmap = mmap(NULL,index_beg,PROT_READ,MAP_SHARED,db,0);
  if(record_mmap == MAP_FAILED)oops("mmap error");

  // Position the descriptor on the first index entry.
  if(lseek(db,(off_t)index_beg,SEEK_SET) == (off_t)-1)oops("lseek error");
  iterate_index(db);

  munmap(record_mmap,index_beg);
  close(db);
  return 0;
}




使用方法:执行
./getlist > chunzhenip.GBK
./getlist | iconv -f GBK -t UTF-8 > chunzhenip.UTF-8
格式是
ip段\t一级地名\t二级地名\n
然后就可以用awk/sed/perl这些文本分析工具来分析了。