# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Mobius Forensic Toolkit
# Copyright (C) 2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020 Eduardo Aguiar
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
import pymobius.browser
import mobius
import datetime

# Identification of this ant. The three values are copied into every Ant
# instance (id, name, version) and passed to item.set_ant () when a run is
# recorded.
ANT_ID = 'text-search'
ANT_NAME = 'Text Search'
ANT_VERSION = '1.0'

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# @brief WWW paths structure
# Host -> [(start1, end1)...(startn,endn)]
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Each tuple is (path prefix, path suffix). The search text is the part of
# the URL path found between them. An empty suffix means the text ends at
# the next '/' after the prefix, or at the end of the path when there is
# none. Patterns of a host are tried in order and the first one yielding a
# non-empty text wins. The host must match exactly (dictionary lookup).
WWW_PATHS = {
  'thepiratebay.se' : [('/search/', '')],
  'thepiratebay.sx' : [('/search/', '')],
  'www.armacaseira.net' : [('/search/label/', '')],
  'www.clickjogos.com.br': [('/busca/', '')],
  # same prefix, one pattern per known trailing path segment
  'www.facebook.com' :
          [('/search/str/', '/keywords_search'),
           ('/search/str/', '/keywords_users'),
           ('/search/str/', '/keywords_groups')],
  'www.frasescurtas.com.br' : [('/search/label/', '')],
  'www.google.com' : [('/maps/search/', '')],
  'www.google.com.br' : [('/maps/search/', '')],
  'www.itatiaia.com.br' : [('/busca/', '')],
  'www.wine-searcher.com' : [('/find/', '')],
  'www.xnxx.com' : [('/search/','')]
}

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# @brief WWW URL query structure
# (Host, path) -> query variable ('*' = any host)
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Maps (host, path) to the name of the query-string variable holding the
# search text. Lookup tries the exact (host, path) key first and falls back
# to ('*', path) only when no host-specific entry exists. Paths are compared
# exactly, which is why variants with and without a trailing '/' are listed
# as separate keys.
WWW_QUERIES = {
  # generic entries, used for any host without a specific entry
  ('*', '/busca') : 'q',
  ('*', '/busca/') : 'q',
  ('*', '/search') : 'q',
  ('*', '/search/') : 'query',
  ('*', '/search/sss') : 'query',
  ('*', '/search.php') : 'q',
  # host-specific entries
  ('br.answers.search.yahoo.com', '/search') : 'p',
  ('br.search.yahoo.com', '/search') : 'p',
  ('busca.drogaraia.com.br', '/search') : 'w',
  ('busca.onofre.com.br', '/search') : 'w',
  ('empresas.americanas.com.br', '/busca') : 'conteudo',
  ('empresas.americanas.com.br', '/busca/') : 'conteudo',
  ('esportes.centauro.com.br', '/search') : 'w',
  ('megafilmesahd.com', '/') : 's',
  ('michaelis.uol.com.br', '/busca') : 'palavra',
  ('nenety.com.br', '/busca/') : 'busca',
  ('pt.pornhub.com', '/video/search') : 'search',
  ('search.conduit.com', '/ResultsExt.aspx') : 'q',
  ('search.incredibar.com', '/') : 'q',
  ('search.snapdo.com', '/') : 'q',
  ('search.sidecubes.com', '/') : 'q',
  ('support.apple.com', '/kb/index') : 'q',
  ('support.google.com', '/a/search') : 'q',
  ('support.google.com', '/drive/search') : 'q',
  ('torrentz2.eu', '/search') : 'f',
  ('www.acessoriostrendparts.com.br', '/loja/busca.php') : 'palavra_busca',
  ('www.aliexpress.com', '/wholesale') : 'SearchText',
  ('www.almg.gov.br', '/busca/busca_geral.html') : 'busca',
  ('www.americanas.com.br', '/busca') : 'conteudo',
  ('www.americanas.com.br', '/busca/') : 'conteudo',
  ('www.ashleymadison.com', '/') : 'keywords',
  ('www.baixaki.com.br', '/busca.asp') : 'q',
  ('www.behance.net', '/search') : 'search',
  ('www.bobwards.com', '/search.cfm') : 'q',
  ('www.consultasocio.com', '/buscar/') : 'keyword',
  ('www.copart.com.br', '/br/search') : 'ocN',
  ('www.dafiti.com.br', '/catalog/') : 'q',
  ('www.etna.com.br', '/search/') : 'text',
  ('www.dicio.com.br', '/pesquisa.php') : 'q',
  ('www.facebook.com', '/search/results.php') : 'q',
  ('www.facebook.com', '/search/pages/') : 'q',
  ('www.facebook.com', '/search/people/') : 'q',
  ('www.facebook.com', '/search/posts/') : 'q',
  ('www.facebook.com', '/search/top/') : 'q',
  ('www.facebook.com', '/search/videos/') : 'q',
  ('www.filecrop.com', '/search.php') : 'w',
  ('www.filmesonlinegratis.net', '/') : 's',
  ('www.freepik.com', '/index.php') : 'k',
  ('www.garmin.com', '/en-US/search/') : 'query',
  ('www.hoteis.com', '/search.do') : 'q-destination',
  ('www.hoteis.com', '/search/searchmap.html') : 'q-destination',
  ('www.kboing.com.br', '/buscamusica.php') : 'palavra',
  ('www.indeed.com.br', '/jobs') : 'q',
  ('www.istockphoto.com', '/br/search/2/image') : 'phrase',
  ('www.jusbrasil.com.br', '/jurisprudencia/busca') : 'q',
  ('www.neopets.com', '/games/arcade_more.phtml') : 'search_game',
  ('www.netshoes.com.br', '/search') : 'Ntt',
  ('www.opensubtitles.org', '/pt/search2') : 'MovieName',
  ('www.portaltudoaqui.com.br', '/busca.php') : 'q',
  ('www.pornhub.com', '/video/search') : 'search',
  ('www.shoptime.com.br', '/busca/') : 'query',
  ('www.submarino.com.br', '/busca') : 'conteudo',
  ('www.walmart.com.br', '/busca/') : 'ft',
  ('www.wines4u.com.br', '/catalogsearch/result/') : 'q',
  ('www.xvideos.com', '/') : 'k',
  ('www.youtube.com', '/results') : 'search_query',
}

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# @brief WWW URL fragment structure
# (Host, path) -> fragment variable
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Maps (host, path) to the name of the variable, inside the URL fragment,
# that holds the search text. The fragment is parsed like a query string.
# There is no '*' wildcard here: only exact (host, path) keys are looked up.
WWW_FRAGMENTS = {
  ('www.google.com.br', '/webhp') : 'q',
  ('www.google.com.br', '/search') : 'q',
}

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# @brief Text Search class
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
class TextSearch (object):
  # Plain record describing one recovered text search. Instances are created
  # empty and filled in field by field by the code that finds the search.
  #
  # Fields:
  #   timestamp  date/time of the search
  #   type       origin of the search (e.g. 'web@' + host)
  #   text       text that was searched for
  #   username   user name associated with the search
  #   metadata   extra attributes describing where the search was found

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  # @brief Initialize object
  # Every field starts as None, so a partially filled record can be read
  # without raising AttributeError.
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  def __init__ (self):
    self.timestamp = None
    self.type = None
    self.text = None
    self.username = None
    self.metadata = None

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
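# @brief Parse a URL query string of the form 'name=value&name=value'
# @param q Query string
# @return dict mapping each variable name to its value ('+' shown as space)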
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
def parse_query (q):
  # Split a 'name=value&name=value' string into a dict.
  #
  # - Pairs are separated by '&' and split on the first '=' only, so a value
  #   may itself contain '='.
  # - Every '+' in a value is replaced by a space. Percent-escapes are left
  #   untouched.
  # - Pairs without '=' are ignored.
  # - When a name appears more than once, the last value wins.
  # - An empty or None query yields an empty dict.
  values = {}

  if not q:
    return values

  for pair in q.split ('&'):
    name, separator, value = pair.partition ('=')

    # partition returns an empty separator when the pair has no '='
    if separator:
      values[name] = value.replace ('+', ' ')

  return values

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# @brief Ant: Text Search
# @author Eduardo Aguiar
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
class Ant (object):

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  # @brief Initialize object
  # @param item Item to be scanned. It must provide the datasource and case
  #        attributes and the text search methods used by this class.
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  def __init__ (self, item):
    self.id = ANT_ID
    self.name = ANT_NAME
    self.version = ANT_VERSION
    self.__item = item

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  # @brief Run ant
  # Collects text searches from the browser history of the item and stores
  # them into the item. Does nothing when the item has no datasource.
  # After a run the collected records are available as self.entries.
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  def run (self):
    if not self.__item.datasource:
      return

    self.__entries = []
    self.__retrieve_browser_history ()

    self.entries = self.__entries
    self.__save_data ()

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  # @brief Retrieve data from browser history
  # For each history entry the search text is looked for in the URL
  # fragment, then in the query string, then in the path. The first
  # non-empty result is used.
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  def __retrieve_browser_history (self):
    model = pymobius.browser.model (self.__item)

    for h in model.history:
      text = self.__retrieve_browser_history_fragment (h) or            \
             self.__retrieve_browser_history_query (h) or               \
             self.__retrieve_browser_history_path (h)

      if not text:
        # development aid: report URLs that look like searches but are not
        # covered by any table yet
        if 'busca' in h.url or 'search' in h.url:
          mobius.core.log ('DEV ant.text_search (URL): ' + h.url)
        continue

      # timestamps flagged as 'local time' cannot be parsed with the format
      # used below, so these entries are skipped
      if 'local time' in h.timestamp:
        continue

      # the URL has already been parsed successfully by the extractor that
      # produced the text
      host = mobius.io.uri (h.url).get_host ()

      ts = TextSearch ()
      ts.timestamp = datetime.datetime.strptime (h.timestamp, '%Y-%m-%d %H:%M:%S')
      ts.type = 'web@' + host
      ts.text = text
      ts.username = h.username

      ts.metadata = mobius.pod.map ()
      ts.metadata.set ('Profile path', h.profile_path)
      ts.metadata.set ('URL', h.url)
      ts.metadata.set ('Host', host)
      ts.metadata.set ('Application ID', h.app)
      ts.metadata.set ('Application', h.app_name)

      self.__entries.append (ts)

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  # @brief Extract search text embedded in a URL path
  # @param path URL path
  # @param patterns List of (prefix, suffix) tuples, tried in order
  # @return Text with '+' replaced by space, or '' if no pattern matches
  #
  # With a non-empty suffix the text is what lies between prefix and suffix.
  # With an empty suffix the text runs from the prefix up to the next '/',
  # or up to the end of the path.
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  @staticmethod
  def _path_text (path, patterns):
    for prefix, suffix in patterns:
      if not path.startswith (prefix):
        continue

      if suffix:
        if not path.endswith (suffix):
          continue
        text = path[len (prefix):-len (suffix)]
      else:
        text = path[len (prefix):].split ('/', 1)[0]

      # an empty match is not a search, so keep trying the other patterns
      if text:
        return text.replace ('+', ' ')

    return ''

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  # @brief Get the value of one variable from a 'name=value&...' string
  # @param encoded Query string or fragment (may be empty or None)
  # @param var Variable name (None when no variable is known for the URL)
  # @return Variable value, or '' when it cannot be found
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  @staticmethod
  def _variable_text (encoded, var):
    if not encoded or not var:
      return ''

    return parse_query (encoded).get (var) or ''

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  # @brief Retrieve data from browser history using subpath
  # @param h History entry
  # @return Search text, or '' if the URL matches no entry of WWW_PATHS
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  def __retrieve_browser_history_path (self, h):
    # best effort: a URL that cannot be parsed simply yields no text
    try:
      uri = mobius.io.uri (h.url)
      host = uri.get_host ()
      path = uri.get_path ()
    except Exception:
      return ''

    return self._path_text (path or '', WWW_PATHS.get (host, []))

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  # @brief Retrieve data from browser history using query variables
  # @param h History entry
  # @return Search text, or '' if the URL matches no entry of WWW_QUERIES
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  def __retrieve_browser_history_query (self, h):
    # best effort: a URL that cannot be parsed simply yields no text
    try:
      uri = mobius.io.uri (h.url)
      host = uri.get_host ()
      path = uri.get_path ()
      query = uri.get_query ()
    except Exception:
      return ''

    # a host-specific entry takes precedence over the generic '*' entry
    var = WWW_QUERIES.get ((host, path)) or WWW_QUERIES.get (('*', path))

    return self._variable_text (query, var)

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  # @brief Retrieve data from browser history using fragment variables
  # @param h History entry
  # @return Search text, or '' if the URL matches no entry of WWW_FRAGMENTS
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  def __retrieve_browser_history_fragment (self, h):
    # best effort: a URL that cannot be parsed simply yields no text
    try:
      uri = mobius.io.uri (h.url)
      host = uri.get_host ()
      path = uri.get_path ()
      fragment = uri.get_fragment ()
    except Exception:
      return ''

    return self._variable_text (fragment, WWW_FRAGMENTS.get ((host, path)))

  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  # @brief Save data into model
  # Replaces the text searches stored in the item with the collected ones
  # and records this ant run, all inside one case transaction.
  # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  def __save_data (self):
    case = self.__item.case
    transaction = case.new_transaction ()

    # remove old data
    self.__item.remove_text_searches ()

    # save text searches
    for ts in self.__entries:
      text_search = self.__item.new_text_search (ts.timestamp, ts.type, ts.text)
      text_search.username = ts.username
      text_search.metadata = ts.metadata

    # set ant run
    self.__item.set_ant (ANT_ID, ANT_NAME, ANT_VERSION)
    transaction.commit ()
