// ==========================================================================
// Author:  Yee Hsu
// Date:    9/3/2013
//
// Desc:    URL and Email Extractor! Extracts URLs or email addresses just by
//          giving the initial URL. The program will crawl the web and look
//          through links it has spidered and extract any URL or email it
//          can find on the way.
// ==========================================================================

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Threading;
using System.Text.RegularExpressions;
using CommonLib;
using System.Xml;
using System.Diagnostics;

namespace Extractor
{
    public partial class Extractor : Form
    {
        // URL-extraction counters; each one is mirrored into a text box by ExtractUrls.
        private int iuTotal = 0;     // URLs recorded so far (shown in textBox5)
        private int iuIgnored = 0;   // URLs dropped by the random sampling or by CanIgnoreUrl (textBox3)
        private int iuFiltered = 0;  // URLs rejected because they were already in dUrl (textBox4)
        private int iuSkipped = 0;   // URLs whose processing threw an exception (textBox6)

        // Email-extraction counters; same roles as above for the email tab.
        private int ieTotal = 0;     // distinct email addresses found so far (textBox10)
        private int ieIgnored = 0;   // pages dropped by the random sampling or by CanIgnoreUrl (textBox12)
        private int ieFiltered = 0;  // pages rejected because they were already in dEmail (textBox11)
        private int ieSkipped = 0;   // pages whose processing threw an exception (textBox9)

        private Thread uThread = null;  // runs ExecuteUrl (URL crawl)
        private Thread eThread = null;  // runs ExecuteEmail (page crawl for the email tab)
        private Thread mThread = null;  // runs ExecuteExtractEmail (downloads crawled pages and pulls emails)

        // URLs already recorded by ExtractUrls; the value is always true.
        private Dictionary<String, Boolean> dUrl = null;
        // Pages found by ExtractEmails; true = not yet scanned for emails, false = scanned.
        private Dictionary<String, Boolean> dEmail = null;
        // Email addresses already reported; the value is always true.
        private Dictionary<String, Boolean> qEmail = null;

        /// <summary>
        /// Creates the extractor window; all control set-up is delegated to
        /// <c>InitializeComponent</c>.
        /// </summary>
        public Extractor()
        {
            this.InitializeComponent();
        }

        /// <summary>
        /// Load handler: nothing is running yet, so both stop buttons
        /// (button2 for URLs, button3 for emails) start out disabled.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            button2.Enabled = false;
            button3.Enabled = false;
        }

        /// <summary>
        /// Prepares a new URL extraction run: clears the counters and the set of
        /// known URLs, locks the input controls, enables the stop button and
        /// shows the "Extracting URLs" status.
        /// </summary>
        private void InitUrl()
        {
            // Start counting from scratch.
            iuTotal = 0;
            iuSkipped = 0;
            iuIgnored = 0;
            iuFiltered = 0;

            // Forget the previous run.
            dUrl = new Dictionary<string, bool>();
            uThread = null;

            // Only the stop button (button2) stays usable while the crawl runs.
            button1.Enabled = false;
            button2.Enabled = true;
            textBox1.Enabled = false;
            checkBox1.Enabled = false;
            checkBox2.Enabled = false;

            // Status indicator.
            textBox7.BackColor = Color.LightGreen;
            textBox7.Text = "Extracting URLs";
        }

        /// <summary>
        /// Prepares a new email extraction run: clears the counters, the page
        /// list and the set of known addresses, locks the input controls,
        /// enables the stop button and shows the "Extracting Emails" status.
        /// </summary>
        private void InitEmail()
        {
            // Start counting from scratch.
            ieTotal = 0;
            ieSkipped = 0;
            ieIgnored = 0;
            ieFiltered = 0;

            // Forget the previous run.
            dEmail = new Dictionary<string, bool>();
            qEmail = new Dictionary<string, bool>();
            eThread = null;
            mThread = null;

            // Only the stop button (button3) stays usable while the crawl runs.
            button4.Enabled = false;
            button3.Enabled = true;
            textBox14.Enabled = false;
            checkBox4.Enabled = false;
            checkBox3.Enabled = false;

            // Status indicator.
            textBox8.BackColor = Color.LightGreen;
            textBox8.Text = "Extracting Emails";
        }

        /// <summary>
        /// Puts the URL tab back into its idle state: stop button disabled,
        /// inputs re-enabled and the status set to "All Done.".
        /// </summary>
        /// <remarks>
        /// Safe to call from any thread. ExecuteUrl calls this from the crawl
        /// thread, and Windows Forms controls may only be touched by the thread
        /// that created them, so the call is re-issued on the UI thread when needed.
        /// </remarks>
        private void TermUrl()
        {
            if (this.InvokeRequired)
            {
                this.Invoke(new Action(this.TermUrl));
                return;
            }

            this.button2.Enabled = false;
            this.button1.Enabled = true;
            this.textBox1.Enabled = true;
            this.checkBox1.Enabled = true;
            this.checkBox2.Enabled = true;

            this.textBox7.Text = "All Done.";
            this.textBox7.BackColor = SystemColors.Control;
        }

        /// <summary>
        /// Puts the email tab back into its idle state: stop button disabled,
        /// inputs re-enabled and the status set to "All Done.".
        /// </summary>
        /// <remarks>
        /// Safe to call from any thread. ExecuteEmail calls this from the crawl
        /// thread, and Windows Forms controls may only be touched by the thread
        /// that created them, so the call is re-issued on the UI thread when needed.
        /// </remarks>
        private void TermEmail()
        {
            if (this.InvokeRequired)
            {
                this.Invoke(new Action(this.TermEmail));
                return;
            }

            this.button3.Enabled = false;
            this.button4.Enabled = true;
            this.textBox14.Enabled = true;
            this.checkBox3.Enabled = true;
            this.checkBox4.Enabled = true;

            this.textBox8.Text = "All Done.";
            this.textBox8.BackColor = SystemColors.Control;
        }

        /// <summary>
        /// Start button of the URL tab: when a start URL has been entered,
        /// resets the tab state and launches the crawl on a worker thread.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            if (this.textBox1.Text.Length == 0)
                return;

            try
            {
                this.InitUrl();
                uThread = new Thread(ExecuteUrl);
                // Background thread: a crawl blocked in a network call must not
                // keep the process alive after the window has been closed.
                uThread.IsBackground = true;
                uThread.Start();
            }
            catch (Exception ex)
            {
                // InitUrl has already locked the inputs; report the failure and
                // reuse the stop handler to unlock them instead of leaving the
                // tab stuck in the "Extracting URLs" state.
                Debug.WriteLine("Could not start URL extraction: " + ex);
                this.button2_Click(sender, e);
            }
        }

        /// <summary>
        /// Start button of the email tab: when a start URL has been entered,
        /// resets the tab state and launches two worker threads, one crawling
        /// pages (ExecuteEmail) and one scanning the crawled pages for
        /// addresses (ExecuteExtractEmail).
        /// </summary>
        private void button4_Click(object sender, EventArgs e)
        {
            if (this.textBox14.Text.Length == 0)
                return;

            try
            {
                this.InitEmail();

                // Background threads: a worker blocked in a network call must not
                // keep the process alive after the window has been closed.
                eThread = new Thread(ExecuteEmail);
                eThread.IsBackground = true;
                eThread.Start();

                mThread = new Thread(ExecuteExtractEmail);
                mThread.IsBackground = true;
                mThread.Start();
            }
            catch (Exception ex)
            {
                // InitEmail has already locked the inputs and one worker may be
                // running; report the failure and reuse the stop handler to
                // abort whatever was started and unlock the tab.
                Debug.WriteLine("Could not start email extraction: " + ex);
                this.button3_Click(sender, e);
            }
        }

        /// <summary>
        /// Stop button of the URL tab: aborts the crawl thread if it is still
        /// running, then returns the tab to its idle state.
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            bool crawlRunning = uThread != null && uThread.IsAlive;

            if (crawlRunning)
            {
                uThread.Abort();
                uThread = null;
            }

            TermUrl();
        }

        /// <summary>
        /// Stop button of the email tab: aborts the page-crawl thread and the
        /// address-extraction thread if they are still running, then returns
        /// the tab to its idle state.
        /// </summary>
        private void button3_Click(object sender, EventArgs e)
        {
            // Page crawler.
            if (eThread != null && eThread.IsAlive)
            {
                eThread.Abort();
                eThread = null;
            }

            // Address extractor.
            if (mThread != null && mThread.IsAlive)
            {
                mThread.Abort();
                mThread = null;
            }

            TermEmail();
        }

        /// <summary>
        /// Entry point of the URL crawl thread: crawls from the URL typed into
        /// textBox1 and restores the idle UI when the crawl ends.
        /// </summary>
        private void ExecuteUrl()
        {
            try
            {
                // Controls may only be read on the thread that created them, so
                // fetch the start URL through the UI thread.
                String startUrl = null;
                this.Invoke(new Action(delegate { startUrl = this.textBox1.Text; }));

                this.ExtractUrls(startUrl);
            }
            catch (ThreadAbortException)
            {
                // Stop was pressed; button2_Click restores the UI itself.
                throw;
            }
            catch (Exception ex)
            {
                // Keep the worker from taking the process down, but leave a trace.
                Debug.WriteLine("URL extraction failed: " + ex);
            }

            // Reached after a normal finish and after a failure, so the tab is
            // never left stuck in the "Extracting URLs" state.
            try
            {
                this.Invoke(new Action(this.TermUrl));
            }
            catch (InvalidOperationException)
            {
                // The window (or its handle) is already gone; nothing to restore.
            }
        }

        /// <summary>
        /// Entry point of the page-crawl thread of the email tab: crawls from
        /// the URL typed into textBox14 and restores the idle UI when the crawl
        /// ends.
        /// </summary>
        private void ExecuteEmail()
        {
            try
            {
                // Controls may only be read on the thread that created them, so
                // fetch the start URL through the UI thread.
                String startUrl = null;
                this.Invoke(new Action(delegate { startUrl = this.textBox14.Text; }));

                this.ExtractEmails(startUrl);
            }
            catch (ThreadAbortException)
            {
                // Stop was pressed; button3_Click restores the UI itself.
                throw;
            }
            catch (Exception ex)
            {
                // Keep the worker from taking the process down, but leave a trace.
                Debug.WriteLine("Email crawl failed: " + ex);
            }

            // Reached after a normal finish and after a failure, so the tab is
            // never left stuck in the "Extracting Emails" state.
            try
            {
                this.Invoke(new Action(this.TermEmail));
            }
            catch (InvalidOperationException)
            {
                // The window (or its handle) is already gone; nothing to restore.
            }
        }

        /// <summary>
        /// Entry point of the address-extraction thread: every three seconds it
        /// scans the pages collected so far for email addresses.
        /// </summary>
        private void ExecuteExtractEmail()
        {
            try
            {
                // Run only while this thread is the registered extraction thread.
                // A plain "mThread != null" check would keep a stale worker alive
                // after a new run has stored a different thread in mThread.
                while (mThread == Thread.CurrentThread)
                {
                    LoopThroughListAndExtractEmail();
                    Thread.Sleep(3000);
                }
            }
            catch (ThreadAbortException)
            {
                // Stop was pressed (button3_Click aborts this thread); the runtime
                // re-raises the abort when this block ends.
            }
            catch (Exception ex)
            {
                // Keep the worker from taking the process down, but leave a trace.
                Debug.WriteLine("Email extraction loop failed: " + ex);
            }
        }

        /// <summary>
        /// Downloads every page in dEmail that is still flagged as pending,
        /// reports each email address not seen before (result box, counter and
        /// the optional text/XML files) and then flags the page as done.
        /// </summary>
        /// <remarks>
        /// The pending pages are copied before they are processed. Writing to
        /// dEmail while enumerating it invalidates the enumerator, which used
        /// to end every pass after a single page. A page that cannot be
        /// downloaded is flagged as done as well, so it is neither retried
        /// forever nor allowed to block the pages behind it.
        /// </remarks>
        private void LoopThroughListAndExtractEmail()
        {
            // Runs a control update on the thread that owns the controls.
            Action<Action> onUi = delegate(Action update)
            {
                if (this.InvokeRequired)
                    this.Invoke(update);
                else
                    update();
            };

            try
            {
                // dEmail is filled by the crawl thread (ExtractEmails) while this
                // thread reads it, so every access here is made under a lock.
                Dictionary<String, Boolean> pages = this.dEmail;
                List<String> pending = new List<String>();

                lock (pages)
                {
                    foreach (KeyValuePair<string, bool> kv in pages)
                    {
                        if (kv.Value)
                            pending.Add(kv.Key);
                    }
                }

                foreach (String pageUrl in pending)
                {
                    try
                    {
                        HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create(pageUrl);
                        hwr.Timeout = 3000;

                        // Dispose the response so the connection is released.
                        String html;
                        using (HttpWebResponse wr = (HttpWebResponse)hwr.GetResponse())
                        using (StreamReader sr = new StreamReader(wr.GetResponseStream()))
                        {
                            html = sr.ReadToEnd();
                        }

                        foreach (String sEmail in Common.ExtractEmails(html))
                        {
                            if (this.qEmail.ContainsKey(sEmail))
                                continue;

                            this.qEmail.Add(sEmail, true);

                            String address = sEmail;
                            String total = Convert.ToString(++ieTotal);
                            bool saveTxt = false;
                            bool saveXml = false;

                            onUi(delegate
                            {
                                this.richTextBox2.AppendText(address);
                                this.richTextBox2.AppendText("\r\n");
                                this.richTextBox2.Focus();
                                this.richTextBox2.ScrollToCaret();
                                this.textBox10.Text = total;
                                saveTxt = this.checkBox4.Checked;
                                saveXml = this.checkBox3.Checked;
                            });

                            // File output stays on the worker thread.
                            if (saveTxt)
                                this.AppendToTxtFileEmail(address);

                            if (saveXml)
                                this.AppendToXmlFileEmail(address);
                        }
                    }
                    catch (ThreadAbortException)
                    {
                        throw;
                    }
                    catch (Exception ex)
                    {
                        Debug.WriteLine("Email extraction failed for " + pageUrl + ": " + ex);
                    }
                    finally
                    {
                        lock (pages)
                        {
                            pages[pageUrl] = false;
                        }
                    }
                }
            }
            catch (ThreadAbortException)
            {
                throw;
            }
            catch (Exception ex)
            {
                // Best effort: the caller runs another pass a few seconds later.
                Debug.WriteLine("Email extraction pass failed: " + ex);
            }
        }

        /// <summary>
        /// Records <paramref name="sUrl"/> as a found URL, downloads the page it
        /// points to and recursively does the same for every link on that page.
        /// </summary>
        /// <param name="sUrl">Address to process; anything not starting with "http" is ignored.</param>
        /// <remarks>
        /// URLs already recorded are counted as filtered. Once more than 50 URLs
        /// have been recorded, each new one is dropped at random with a 50%
        /// chance and counted as ignored. Links rejected by CanIgnoreUrl are
        /// counted as ignored too. Any failure counts the URL as skipped.
        /// NOTE(review): the crawl is recursive with no depth limit, so a long
        /// link chain can exhaust the stack.
        /// </remarks>
        private void ExtractUrls(String sUrl)
        {
            // Runs a control update on the thread that owns the controls; this
            // method is executed on the crawl thread.
            Action<Action> onUi = delegate(Action update)
            {
                if (this.InvokeRequired)
                    this.Invoke(update);
                else
                    update();
            };

            try
            {
                if (!sUrl.StartsWith("http", StringComparison.Ordinal))
                    return;

                if (this.dUrl.ContainsKey(sUrl))
                {
                    String filteredCount = Convert.ToString(++iuFiltered);
                    onUi(delegate { this.textBox4.Text = filteredCount; });
                    return;
                }

                if (iuTotal > 50)
                {
                    Random r = new Random();

                    if (r.Next(2) == 1)
                    {
                        String sampledOut = Convert.ToString(++iuIgnored);
                        onUi(delegate { this.textBox3.Text = sampledOut; });
                        return;
                    }
                }

                this.dUrl.Add(sUrl, true);

                String totalCount = Convert.ToString(++iuTotal);
                bool saveTxt = false;
                bool saveXml = false;

                onUi(delegate
                {
                    this.textBox2.Text = sUrl;
                    this.richTextBox1.AppendText(sUrl);
                    this.richTextBox1.AppendText("\r\n");
                    this.richTextBox1.Focus();
                    this.richTextBox1.ScrollToCaret();
                    this.textBox5.Text = totalCount;
                    saveTxt = this.checkBox1.Checked;
                    saveXml = this.checkBox2.Checked;
                });

                // File output stays on the worker thread.
                if (saveTxt)
                    this.AppendToTxtFileUrl(sUrl);

                if (saveXml)
                    this.AppendToXmlFileUrl(sUrl);

                HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create(sUrl);
                hwr.Timeout = 10000;

                // Read the whole page and release the connection before
                // recursing; otherwise every level of the recursion would keep
                // a response open.
                String html;
                using (HttpWebResponse wr = (HttpWebResponse)hwr.GetResponse())
                using (StreamReader sr = new StreamReader(wr.GetResponseStream()))
                {
                    html = sr.ReadToEnd();
                }

                foreach (String surl in Common.ExtractURLs(html))
                {
                    if (this.CanIgnoreUrl(surl))
                    {
                        String ignoredCount = Convert.ToString(++iuIgnored);
                        onUi(delegate { this.textBox3.Text = ignoredCount; });

                        // Skip just this link; returning here would discard
                        // every remaining link of the page.
                        continue;
                    }

                    this.ExtractUrls(surl);
                }
            }
            catch (ThreadAbortException)
            {
                // Stop was pressed: this is not a skipped page, and the counter
                // must not grow once per recursion level while the abort unwinds.
                throw;
            }
            catch (Exception)
            {
                String skippedCount = Convert.ToString(++iuSkipped);
                onUi(delegate { this.textBox6.Text = skippedCount; });
            }
        }

        /// <summary>
        /// Registers <paramref name="sUrl"/> as a page to be scanned for email
        /// addresses, downloads it and recursively does the same for every link
        /// on that page. The addresses themselves are extracted by
        /// LoopThroughListAndExtractEmail on another thread.
        /// </summary>
        /// <param name="sUrl">Address to process; anything not starting with "http" is ignored.</param>
        /// <remarks>
        /// Pages already registered are counted as filtered. Once more than 50
        /// addresses have been found, each new page is dropped at random with a
        /// 50% chance and counted as ignored. Links rejected by CanIgnoreUrl are
        /// counted as ignored too. Any failure counts the page as skipped.
        /// NOTE(review): the crawl is recursive with no depth limit, so a long
        /// link chain can exhaust the stack.
        /// </remarks>
        private void ExtractEmails(String sUrl)
        {
            // Runs a control update on the thread that owns the controls; this
            // method is executed on the crawl thread.
            Action<Action> onUi = delegate(Action update)
            {
                if (this.InvokeRequired)
                    this.Invoke(update);
                else
                    update();
            };

            try
            {
                if (!sUrl.StartsWith("http", StringComparison.Ordinal))
                    return;

                // dEmail is read by the extraction thread
                // (LoopThroughListAndExtractEmail) while this thread fills it,
                // so every access here is made under a lock.
                Dictionary<String, Boolean> pages = this.dEmail;
                bool alreadyKnown;

                lock (pages)
                {
                    alreadyKnown = pages.ContainsKey(sUrl);
                }

                if (alreadyKnown)
                {
                    String filteredCount = Convert.ToString(++ieFiltered);
                    onUi(delegate { this.textBox11.Text = filteredCount; });
                    return;
                }

                if (ieTotal > 50)
                {
                    Random r = new Random();

                    if (r.Next(2) == 1)
                    {
                        String sampledOut = Convert.ToString(++ieIgnored);
                        onUi(delegate { this.textBox12.Text = sampledOut; });
                        return;
                    }
                }

                lock (pages)
                {
                    pages.Add(sUrl, true);
                }

                onUi(delegate { this.textBox13.Text = sUrl; });

                HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create(sUrl);
                hwr.Timeout = 10000;

                // Read the whole page and release the connection before
                // recursing; otherwise every level of the recursion would keep
                // a response open.
                String html;
                using (HttpWebResponse wr = (HttpWebResponse)hwr.GetResponse())
                using (StreamReader sr = new StreamReader(wr.GetResponseStream()))
                {
                    html = sr.ReadToEnd();
                }

                foreach (String surl in Common.ExtractURLs(html))
                {
                    if (this.CanIgnoreUrl(surl))
                    {
                        String ignoredCount = Convert.ToString(++ieIgnored);
                        onUi(delegate { this.textBox12.Text = ignoredCount; });

                        // Skip just this link; returning here would discard
                        // every remaining link of the page.
                        continue;
                    }

                    this.ExtractEmails(surl);
                }
            }
            catch (ThreadAbortException)
            {
                // Stop was pressed: this is not a skipped page, and the counter
                // must not grow once per recursion level while the abort unwinds.
                throw;
            }
            catch (Exception)
            {
                String skippedCount = Convert.ToString(++ieSkipped);
                onUi(delegate { this.textBox9.Text = skippedCount; });
            }
        }

        // File extensions of downloads and media that the crawler does not follow.
        private static readonly string[] IgnoredUrlExtensions =
        {
            ".exe", ".bin", ".tar", ".mp3", ".avi", ".gif",
            ".doc", ".pdf", ".jpg", ".mpeg", ".gz"
        };

        /// <summary>
        /// Tells whether a link should be left out of the crawl because it ends
        /// with one of the ignored file extensions.
        /// </summary>
        /// <param name="sUrl">Link to test; null or empty is never ignorable.</param>
        /// <returns>true when the link ends with an ignored extension.</returns>
        /// <remarks>
        /// The comparison is ordinal and ignores case, so ".JPG" is treated like
        /// ".jpg" and the result does not depend on the current culture.
        /// </remarks>
        private bool CanIgnoreUrl(string sUrl)
        {
            if (string.IsNullOrEmpty(sUrl))
                return false;

            foreach (string extension in IgnoredUrlExtensions)
            {
                if (sUrl.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
                    return true;
            }

            return false;
        }

        /// <summary>
        /// Closing handler: reuses the two stop-button handlers so that any
        /// running worker thread is aborted before the window goes away.
        /// </summary>
        private void Form1_FormClosing(object sender, FormClosingEventArgs e)
        {
            // URL tab first, then the email tab.
            button2_Click(sender, e);
            button3_Click(sender, e);
        }

        /// <summary>
        /// "Save File" menu item: opens the save dialog. The dialog result is
        /// not inspected here; the file is written by saveFileDialog1_FileOk.
        /// </summary>
        private void saveFileToolStripMenuItem_Click(object sender, EventArgs e)
        {
            saveFileDialog1.ShowDialog();
        }

        /// <summary>
        /// "Exit" menu item: asks the application to shut down.
        /// </summary>
        private void exitToolStripMenuItem_Click(object sender, EventArgs e)
        {
            Application.Exit();
        }

        /// <summary>
        /// Writes the results of the selected tab to the file chosen in the save
        /// dialog, one entry per line: the found URLs when the first tab is
        /// selected, the found email addresses otherwise.
        /// </summary>
        /// <remarks>
        /// The result dictionaries are only created when an extraction is
        /// started, so a save requested before any run produces an empty file
        /// instead of failing with a NullReferenceException.
        /// </remarks>
        private void saveFileDialog1_FileOk(object sender, CancelEventArgs e)
        {
            Dictionary<String, Boolean> results =
                (this.tabControl1.SelectedIndex == 0) ? this.dUrl : this.qEmail;

            // Copy the entries up front: the worker threads may still be adding
            // results, and a dictionary must not change while it is enumerated.
            // The copy only shortens that window; it is not a full guard.
            List<String> lines = (results == null)
                ? new List<String>()
                : new List<String>(results.Keys);

            using (TextWriter tw = new StreamWriter(this.saveFileDialog1.FileName))
            {
                foreach (String line in lines)
                {
                    tw.WriteLine(line);
                }
            }
        }

        /// <summary>
        /// Appends one line to the text log of found URLs. The file name is
        /// built from Common.GetCurrentDirectory() and Common.GetNow_YYYYMMDD().
        /// </summary>
        /// <param name="str">Text to write as a line.</param>
        private void AppendToTxtFileUrl(String str)
        {
            String logPath = String.Format("{0}ExtractURL_{1}.txt", Common.GetCurrentDirectory(), Common.GetNow_YYYYMMDD());

            // Opened in append mode; leaving the using block flushes and closes the file.
            using (StreamWriter log = new StreamWriter(logPath, true))
            {
                log.WriteLine(str);
            }
        }

        /// <summary>
        /// Appends one line to the text log of found email addresses. The file
        /// name is built from Common.GetCurrentDirectory() and
        /// Common.GetNow_YYYYMMDD().
        /// </summary>
        /// <param name="str">Text to write as a line.</param>
        private void AppendToTxtFileEmail(String str)
        {
            String logPath = String.Format("{0}ExtractEmail_{1}.txt", Common.GetCurrentDirectory(), Common.GetNow_YYYYMMDD());

            // Opened in append mode; leaving the using block flushes and closes the file.
            using (StreamWriter log = new StreamWriter(logPath, true))
            {
                log.WriteLine(str);
            }
        }

        /// <summary>
        /// Adds a <c>Url</c> element containing <paramref name="str"/> to the XML
        /// log of found URLs, creating the file with an empty
        /// <c>ExtractedData</c> root first when it does not exist yet.
        /// </summary>
        /// <param name="str">Text stored inside the new element.</param>
        /// <remarks>
        /// The whole document is loaded and saved again on every call.
        /// </remarks>
        private void AppendToXmlFileUrl(String str)
        {
            String sFile = String.Format("{0}ExtractURL_{1}.xml", Common.GetCurrentDirectory(), Common.GetNow_YYYYMMDD());

            if (!File.Exists(sFile))
            {
                // using guarantees the file handle is released even when one of
                // the writes throws; otherwise the file would stay locked.
                using (XmlTextWriter xw = new XmlTextWriter(sFile, System.Text.Encoding.UTF8))
                {
                    xw.Formatting = Formatting.Indented;
                    xw.WriteProcessingInstruction("xml", "version='1.0' encoding='UTF-8'");
                    xw.WriteComment("Epic Extractor - XML Version 5.0");
                    xw.WriteStartElement("ExtractedData");
                    xw.WriteFullEndElement();
                }
            }

            XmlDocument xd = new XmlDocument();
            xd.Load(sFile);

            // Build the element completely, then attach it to the root.
            XmlElement xElem = xd.CreateElement("Url");
            xElem.AppendChild(xd.CreateTextNode(str));
            xd.DocumentElement.AppendChild(xElem);

            xd.Save(sFile);
        }

        /// <summary>
        /// Adds an <c>Email</c> element containing <paramref name="str"/> to the
        /// XML log of found email addresses, creating the file with an empty
        /// <c>ExtractedData</c> root first when it does not exist yet.
        /// </summary>
        /// <param name="str">Text stored inside the new element.</param>
        /// <remarks>
        /// The whole document is loaded and saved again on every call.
        /// </remarks>
        private void AppendToXmlFileEmail(String str)
        {
            String sFile = String.Format("{0}ExtractEmail_{1}.xml", Common.GetCurrentDirectory(), Common.GetNow_YYYYMMDD());

            if (!File.Exists(sFile))
            {
                // using guarantees the file handle is released even when one of
                // the writes throws; otherwise the file would stay locked.
                using (XmlTextWriter xw = new XmlTextWriter(sFile, System.Text.Encoding.UTF8))
                {
                    xw.Formatting = Formatting.Indented;
                    xw.WriteProcessingInstruction("xml", "version='1.0' encoding='UTF-8'");
                    xw.WriteComment("Epic Extractor - XML Version 5.0");
                    xw.WriteStartElement("ExtractedData");
                    xw.WriteFullEndElement();
                }
            }

            XmlDocument xd = new XmlDocument();
            xd.Load(sFile);

            // Build the element completely, then attach it to the root.
            XmlElement xElem = xd.CreateElement("Email");
            xElem.AppendChild(xd.CreateTextNode(str));
            xd.DocumentElement.AppendChild(xElem);

            xd.Save(sFile);
        }

        /// <summary>
        /// "About" menu item: creates a new About window and shows it without
        /// blocking the main window.
        /// </summary>
        private void aboutToolStripMenuItem_Click(object sender, EventArgs e)
        {
            new About().Show();
        }

        /// <summary>
        /// Click handler of the first picture: passes a fixed web address to
        /// Process.Start.
        /// </summary>
        private void pictureBox1_Click(object sender, EventArgs e)
        {
            const string target = "http://www.yahoo.com";
            Process.Start(target);
        }

        /// <summary>
        /// Click handler of the second picture: passes a fixed web address to
        /// Process.Start.
        /// </summary>
        private void pictureBox2_Click(object sender, EventArgs e)
        {
            const string target = "http://www.yahoo.com";
            Process.Start(target);
        }
    }
}