Skip to content

UnicodeWarning occurred due to character code issues #74

@munchpeep

Description

@munchpeep

I wrote a script to sanitize (strip) macros from an OLE document:

# -*- coding: utf-8 -*-

import olefile
import struct
import binascii

class ObliterableOleFileIO(olefile.OleFileIO):
    """OleFileIO subclass that can blank out and rename directory entries.

    Intended to be opened with ``write_mode=True`` so that sectors can be
    rewritten in place.
    """

    ENDOFCHAIN = 0xFFFFFFFE #: (-2) end of a virtual stream chain

    def obliterate_stream(self, stream_name):
        """Overwrite a stream's data with zeros and rename its directory entry.

        The first character of the entry name is replaced with U+5F5F and the
        modified 128-byte directory entry is written back into the directory
        stream of the file.

        Only streams whose size is at least ``self.minisectorcutoff`` have
        their content zeroed; smaller entries (and storages) are only renamed.

        :param stream_name: path of the stream or storage, in any form
            accepted by ``self._find``.
        :raises IOError: if the entry is neither a stream nor a storage, or
            if the directory sector chain in the FAT is inconsistent.
        """

        def _pos_of(sect_from, offset, fat, sector_size):
            # Translate a byte offset inside a sector chain starting at
            # `sect_from` into (sector index, offset inside that sector).
            if not len(fat):
                raise IOError('incorrect OLE FAT')
            sect = sect_from
            hops = 0
            while offset >= sector_size:
                # A valid chain never visits more sectors than the FAT has
                # entries; the bound protects against cyclic chains, and the
                # range check rejects ENDOFCHAIN / free-sector markers.
                if hops >= len(fat) or not 0 <= sect < len(fat):
                    raise IOError('incorrect OLE FAT')
                sect = fat[sect]
                offset -= sector_size
                hops += 1
            return sect, offset

        def _clsid_to_bytes(clsid):
            # Convert the textual CLSID back to its 16 raw bytes.
            # NOTE(review): assumes olefile renders CLSIDs in the standard
            # GUID text form, whose first three fields are stored
            # little-endian on disk, so a plain unhexlify would reorder them.
            if not clsid:
                return b''  # '16s' pads a short value with NUL bytes
            fields = clsid.split('-')
            if len(fields) != 5:
                # Unknown layout: keep the straightforward hex conversion.
                return binascii.unhexlify(clsid.replace('-', ''))
            head = struct.pack('<IHH',
                               int(fields[0], 16),
                               int(fields[1], 16),
                               int(fields[2], 16))
            return head + binascii.unhexlify(fields[3] + fields[4])

        sid = self._find(stream_name)
        entry = self.direntries[sid]
        if entry.entry_type not in (olefile.STGTY_STREAM,
                                    olefile.STGTY_STORAGE):
            raise IOError("this is not a stream or strage")
        if entry.size >= self.minisectorcutoff:
            self.write_stream(stream_name, b'\x00' * entry.size)

        marker = u'\u5f5f'
        try:
            entry.name = marker + entry.name[1:]
        except UnicodeDecodeError:
            # On Python 2 the name may be a byte string with non-ASCII bytes,
            # which cannot be concatenated with unicode implicitly. Decode
            # first, then drop the first character (not the first byte).
            # NOTE(review): cp932 is the codec the original script assumed.
            entry.name = marker + entry.name.decode('cp932')[1:]
        entry.name_raw = entry.name.encode('UTF-16LE')

        # 128-byte directory entry record; '64s' pads the raw name with NULs.
        dirent = struct.pack('<64sHBBIII16sIQQIII',
                             entry.name_raw,
                             entry.namelength,
                             entry.entry_type,
                             entry.color,
                             entry.sid_left,
                             entry.sid_right,
                             entry.sid_child,
                             _clsid_to_bytes(entry.clsid),
                             entry.dwUserFlags,
                             entry.createTime,
                             entry.modifyTime,
                             entry.isectStart,
                             entry.sizeLow,
                             entry.sizeHigh)

        # Use the file's real sector size when olefile exposes it; fall back
        # to 512, the value this code previously hard-coded.
        sector_size = getattr(self, 'sectorsize', 512)
        sect, offset = _pos_of(
            self.first_dir_sector, 128 * entry.sid, self.fat, sector_size)
        data = self.getsect(sect)
        data = data[:offset] + dirent + data[offset+128:]
        self.write_sect(sect, data)

if __name__ == '__main__':

    # Open the document for in-place modification and make sure the file
    # handle is released even if obliteration fails part-way through.
    ole = ObliterableOleFileIO('test01_b.xls', write_mode=True)
    try:
        target_strage = '_VBA_PROJECT_CUR'
        vbaproj_strage = None
        vba_strage = None
        # Iterate over a copy of the listing: entries are renamed while
        # we walk it.
        for stream_name in ole.listdir()[:]:
            if stream_name[0] == target_strage:
                ole.obliterate_stream(stream_name)
                if vbaproj_strage is None:
                    vbaproj_strage = stream_name[0]
                if len(stream_name) > 2:
                    if vba_strage is None and stream_name[1] == 'VBA':
                        vba_strage = stream_name[:2]
        # Storages are handled last (inner 'VBA' first, then the project
        # root) so the paths used above are still the original names.
        if vba_strage:
            ole.obliterate_stream(vba_strage)
        if vbaproj_strage:
            ole.obliterate_stream(vbaproj_strage)
    finally:
        ole.close()

The following warning occurred when sanitizing an OLE document that contains a stream with a Japanese name:

C:\Users\deego\Desktop>python test.py
C:\Python27\lib\site-packages\olefile-0.44-py2.7.egg\olefile\olefile.py:1981: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
if kid.name.lower() == name.lower():

the following patch fixes it.

+++ olefile.py  (working copy)
@@ -1978,6 +1978,10 @@
         node = self.root
         for name in filename:
             for kid in node.kids:
+                if isinstance(kid.name,unicode):
+                    kid.name = kid.name.encode('UTF-16LE')
+                if isinstance(name,unicode):
+                    name = name.encode('UTF-16LE')
                 if kid.name.lower() == name.lower():
                     break
             else:

I would be glad if you could release a version of olefile that includes this fix.

Thank you.

Metadata

Metadata

Assignees

Labels

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions