/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.collection;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.net.URLFilter;
import org.apache.xerces.util.DOMUtil;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

/**
 * A Subcollection represents a named subset of the index. Membership is
 * decided by plain substring patterns: a URL belongs to the subcollection when
 * it contains at least one whitelist pattern and no blacklist pattern.
 *
 * <p>
 * When the configuration property <code>subcollection.case.insensitive</code>
 * is <code>true</code>, patterns and URLs are compared after lower-casing
 * with {@link Locale#ROOT}.
 * </p>
 */
public class Subcollection extends Configured implements URLFilter {

  public static final String TAG_COLLECTIONS = "subcollections";
  public static final String TAG_COLLECTION = "subcollection";
  public static final String TAG_WHITELIST = "whitelist";
  public static final String TAG_BLACKLIST = "blacklist";
  public static final String TAG_NAME = "name";
  public static final String TAG_KEY = "key";
  public static final String TAG_ID = "id";

  /**
   * Configuration property that switches pattern matching to case-insensitive
   * mode.
   */
  private static final String CASE_INSENSITIVE_KEY = "subcollection.case.insensitive";

  /**
   * Parsed blacklist patterns; a URL containing any of them is rejected
   */
  List<String> blackList = new ArrayList<String>();

  /**
   * Parsed whitelist patterns; a URL must contain one of them to be accepted
   */
  List<String> whiteList = new ArrayList<String>();

  /**
   * SubCollection identifier
   */
  String id;

  /**
   * SubCollection key
   */
  String key;

  /**
   * SubCollection name
   */
  String name;

  /**
   * SubCollection whitelist as String
   */
  String wlString;

  /**
   * SubCollection blacklist as String
   */
  String blString;

  /**
   * Whether the white and black lists are matched case-insensitively
   */
  boolean caseInsensitive = false;

  /**
   * Creates a subcollection without a key.
   * 
   * @param id
   *          Id of SubCollection
   * @param name
   *          Name of SubCollection
   * @param conf A populated {@link Configuration}
   */
  public Subcollection(String id, String name, Configuration conf) {
    this(id, name, null, conf);
  }

  /**
   * Creates a subcollection with the given identity.
   * 
   * @param id
   *          Id of SubCollection
   * @param name
   *          Name of SubCollection
   * @param key SubCollection key, may be <code>null</code>
   * @param conf A populated {@link Configuration}
   */
  public Subcollection(String id, String name, String key, Configuration conf) {
    // The delegated constructor already reads the case-sensitivity setting.
    this(conf);
    this.id = id;
    this.key = key;
    this.name = name;
  }

  /**
   * Creates an empty subcollection; id, name and pattern lists are expected
   * to be filled in afterwards, e.g. via {@link #initialize(Element)}.
   * 
   * @param conf
   *          A populated {@link Configuration}; when <code>null</code>,
   *          matching stays case-sensitive
   */
  public Subcollection(Configuration conf) {
    super(conf);
    caseInsensitive = conf != null
        && conf.getBoolean(CASE_INSENSITIVE_KEY, false);
  }

  /**
   * @return Returns the name
   */
  public String getName() {
    return name;
  }

  /**
   * @return Returns the key
   */
  public String getKey() {
    return key;
  }

  /**
   * @return Returns the id
   */
  public String getId() {
    return id;
  }

  /**
   * Returns whitelist
   * 
   * @return Whitelist entries
   */
  public List<String> getWhiteList() {
    return whiteList;
  }

  /**
   * Returns whitelist String
   * 
   * @return Whitelist String
   */
  public String getWhiteListString() {
    return wlString;
  }

  /**
   * Returns blacklist String
   * 
   * @return Blacklist String
   */
  public String getBlackListString() {
    return blString;
  }

  /**
   * Replaces the whitelist with the given list. The list is stored as-is (not
   * copied) and the whitelist String is left unchanged.
   * 
   * @param whiteList
   *          The whiteList to set; <code>null</code> is treated as an empty
   *          list so that later filtering and parsing keep working.
   */
  public void setWhiteList(ArrayList<String> whiteList) {
    this.whiteList = (whiteList != null) ? whiteList : new ArrayList<String>();
  }

  /**
   * Simple substring ("contains") filter for matching patterns.
   * 
   * <pre>
   *  rules for evaluation are as follows:
   *  1. if a blacklist pattern occurs in the url then the url is rejected
   *  2. if a whitelist pattern occurs in the url then the url is allowed
   *  3. otherwise the url is rejected
   * </pre>
   * 
   * In case-insensitive mode the comparison is done on lower-cased copies;
   * the url handed back to the caller is always the unmodified argument.
   * 
   * @param urlString
   *          the url to test
   * @return <code>urlString</code> if it is part of this subcollection,
   *         <code>null</code> if it is rejected or if <code>urlString</code>
   *         itself is <code>null</code>
   * @see org.apache.nutch.net.URLFilter#filter(java.lang.String)
   */
  @Override
  public String filter(String urlString) {
    if (urlString == null) {
      return null;
    }

    // Normalize once up front instead of once per pattern.
    final String candidate = caseInsensitive
        ? urlString.toLowerCase(Locale.ROOT)
        : urlString;

    // Blacklist wins over whitelist.
    if (containsAny(candidate, blackList)) {
      return null;
    }
    return containsAny(candidate, whiteList) ? urlString : null;
  }

  /**
   * Tells whether any of the patterns occurs as a substring of the candidate.
   * 
   * @param candidate
   *          url to search in, already lower-cased when matching is
   *          case-insensitive
   * @param patterns
   *          patterns to look for
   * @return <code>true</code> on the first pattern found in the candidate
   */
  private boolean containsAny(String candidate, List<String> patterns) {
    for (String pattern : patterns) {
      if (pattern == null) {
        continue;
      }
      // Lists installed through setWhiteList(ArrayList) bypass parseList, so
      // their entries may still be mixed case; normalize here as well.
      String needle = caseInsensitive
          ? pattern.toLowerCase(Locale.ROOT)
          : pattern;
      if (candidate.contains(needle)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Initialize Subcollection from dom element.
   * 
   * <p>
   * The first <code>id</code>, <code>name</code> and <code>whitelist</code>
   * descendants are read unconditionally; a missing one is not handled here
   * and will presumably surface as a {@link NullPointerException}. The
   * blacklist is optional, and the key is only taken over when exactly one
   * <code>key</code> element is present.
   * </p>
   * 
   * @param collection A DOM {@link org.w3c.dom.Element} for use 
   * in creating the {@link Subcollection}
   */
  public void initialize(Element collection) {
    this.id = DOMUtil.getChildText(
        collection.getElementsByTagName(TAG_ID).item(0)).trim();
    this.name = DOMUtil.getChildText(
        collection.getElementsByTagName(TAG_NAME).item(0)).trim();
    this.wlString = DOMUtil.getChildText(
        collection.getElementsByTagName(TAG_WHITELIST).item(0)).trim();

    parseList(this.whiteList, wlString);

    // Check if there's a blacklist we need to parse
    NodeList nodeList = collection.getElementsByTagName(TAG_BLACKLIST);
    if (nodeList.getLength() > 0) {
      this.blString = DOMUtil.getChildText(nodeList.item(0)).trim();
      parseList(this.blackList, blString);
    }

    // Take over the key only if exactly one key element is present
    nodeList = collection.getElementsByTagName(TAG_KEY);
    if (nodeList.getLength() == 1) {
      this.key = DOMUtil.getChildText(nodeList.item(0)).trim();
    }
  }

  /**
   * Create a list of patterns from a chunk of text; patterns are separated
   * by newlines (<code>\n</code> or <code>\r</code>). Each line is trimmed,
   * blank lines are dropped, and in case-insensitive mode the pattern is
   * lower-cased using {@link Locale#ROOT}.
   * 
   * @param list An initialized {@link List} to insert String patterns into;
   *          its previous contents are discarded.
   * @param text A chunk of text (hopefully) containing patterns;
   *          <code>null</code> is treated as containing no patterns.
   */
  protected void parseList(List<String> list, String text) {
    list.clear();

    if (text == null) {
      return;
    }

    StringTokenizer st = new StringTokenizer(text, "\n\r");
    while (st.hasMoreTokens()) {
      String line = st.nextToken().trim();
      // An empty pattern would match every url, so skip it.
      if (line.isEmpty()) {
        continue;
      }
      if (caseInsensitive) {
        line = line.toLowerCase(Locale.ROOT);
      }
      list.add(line);
    }
  }

  /**
   * Set contents of blacklist from String
   * 
   * @param list
   *          the blacklist contents, one pattern per line
   */
  public void setBlackList(String list) {
    this.blString = list;
    parseList(blackList, list);
  }

  /**
   * Set contents of whitelist from String
   * 
   * @param list
   *          the whitelist contents, one pattern per line
   */
  public void setWhiteList(String list) {
    this.wlString = list;
    parseList(whiteList, list);
  }
}
