// ==========================================================================
// Author: Yee Hsu
// Date: 9/3/2013
//
// Desc: URL and Email Extractor! Extracts URLs or email addresses just by
// giving the initial URL. The program will crawl the web, look
// through the links it has spidered, and extract any URL or email it
// can find on the way.
// ==========================================================================
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Threading;
using System.Text.RegularExpressions;
using CommonLib;
using System.Xml;
using System.Diagnostics;
namespace Extractor
{
public partial class Extractor : Form
{
// ---- URL extraction counters (reset by InitUrl, updated by ExtractUrls) ----
// Number of URLs recorded so far; shown in textBox5 and used as the
// "more than 50" threshold that switches on random sampling.
private int iuTotal = 0;
// URLs dropped by random sampling or by CanIgnoreUrl; shown in textBox3.
private int iuIgnored = 0;
// URLs that were already present in dUrl (duplicates); shown in textBox4.
private int iuFiltered = 0;
// URLs whose processing raised an exception; shown in textBox6.
private int iuSkipped = 0;
// ---- Email extraction counters (reset by InitEmail) ----
// Number of distinct email addresses reported; shown in textBox10.
private int ieTotal = 0;
// Pages dropped by random sampling or by CanIgnoreUrl; shown in textBox12.
private int ieIgnored = 0;
// Pages that were already present in dEmail (duplicates); shown in textBox11.
private int ieFiltered = 0;
// Pages whose processing raised an exception; shown in textBox9.
private int ieSkipped = 0;
// Worker thread running ExecuteUrl (the URL crawl).
private Thread uThread = null;
// Worker thread running ExecuteEmail (the page crawl for the email tab).
private Thread eThread = null;
// Worker thread running ExecuteExtractEmail (scans crawled pages for emails).
private Thread mThread = null;
// URLs already recorded by the URL crawl; only the keys are used (value is always true).
private Dictionary<String, Boolean> dUrl = null;
// Pages found by the email crawl. Value is true while the page still has to be
// scanned for addresses; LoopThroughListAndExtractEmail sets it to false afterwards.
private Dictionary<String, Boolean> dEmail = null;
// Email addresses already reported; only the keys are used (value is always true).
private Dictionary<String, Boolean> qEmail = null;
/// <summary>
/// Creates the main form and builds its designer-generated controls.
/// </summary>
public Extractor()
{
    this.InitializeComponent();
}
/// <summary>
/// Load handler: both buttons that stop an extraction start out disabled
/// because nothing is running yet.
/// </summary>
private void Form1_Load(object sender, EventArgs e)
{
    button3.Enabled = false;
    button2.Enabled = false;
}
/// <summary>
/// Prepares a fresh URL extraction run: clears the counters and the set of
/// recorded URLs, then switches the URL tab into its "running" state.
/// </summary>
private void InitUrl()
{
    // Reset run state.
    iuTotal = 0;
    iuSkipped = 0;
    iuIgnored = 0;
    iuFiltered = 0;
    uThread = null;
    dUrl = new Dictionary<String, Boolean>();

    // Lock the inputs and enable only the stop button while the crawl runs.
    textBox1.Enabled = false;
    checkBox1.Enabled = false;
    checkBox2.Enabled = false;
    button1.Enabled = false;
    button2.Enabled = true;

    // Status indicator.
    textBox7.Text = "Extracting URLs";
    textBox7.BackColor = Color.LightGreen;
}
/// <summary>
/// Prepares a fresh email extraction run: clears the counters, the crawled
/// page list and the set of reported addresses, then switches the email tab
/// into its "running" state.
/// </summary>
private void InitEmail()
{
    // Reset run state.
    ieTotal = 0;
    ieSkipped = 0;
    ieIgnored = 0;
    ieFiltered = 0;
    eThread = null;
    mThread = null;
    dEmail = new Dictionary<String, Boolean>();
    qEmail = new Dictionary<String, Boolean>();

    // Lock the inputs and enable only the stop button while the crawl runs.
    textBox14.Enabled = false;
    checkBox3.Enabled = false;
    checkBox4.Enabled = false;
    button4.Enabled = false;
    button3.Enabled = true;

    // Status indicator.
    textBox8.Text = "Extracting Emails";
    textBox8.BackColor = Color.LightGreen;
}
/// <summary>
/// Switches the URL tab back to its idle state: re-enables the inputs and the
/// start button, disables the stop button and shows "All Done.".
/// </summary>
/// <remarks>
/// ExecuteUrl calls this from the crawler thread, while button2_Click calls it
/// from the UI thread. Controls may only be touched on the thread that created
/// them, so a call from any other thread is re-dispatched to the UI thread.
/// </remarks>
private void TermUrl()
{
    if (this.InvokeRequired)
    {
        // Re-enter on the UI thread; Invoke waits until the update is applied.
        this.Invoke(new MethodInvoker(this.TermUrl));
        return;
    }

    this.button2.Enabled = false;
    this.button1.Enabled = true;
    this.textBox1.Enabled = true;
    this.checkBox1.Enabled = true;
    this.checkBox2.Enabled = true;
    this.textBox7.Text = "All Done.";
    this.textBox7.BackColor = SystemColors.Control;
}
/// <summary>
/// Switches the email tab back to its idle state: re-enables the inputs and
/// the start button, disables the stop button and shows "All Done.".
/// </summary>
/// <remarks>
/// ExecuteEmail calls this from the crawler thread, while button3_Click calls
/// it from the UI thread. Controls may only be touched on the thread that
/// created them, so a call from any other thread is re-dispatched to the UI
/// thread.
/// </remarks>
private void TermEmail()
{
    if (this.InvokeRequired)
    {
        // Re-enter on the UI thread; Invoke waits until the update is applied.
        this.Invoke(new MethodInvoker(this.TermEmail));
        return;
    }

    this.button3.Enabled = false;
    this.button4.Enabled = true;
    this.textBox14.Enabled = true;
    this.checkBox3.Enabled = true;
    this.checkBox4.Enabled = true;
    this.textBox8.Text = "All Done.";
    this.textBox8.BackColor = SystemColors.Control;
}
/// <summary>
/// Start button of the URL tab: when a start URL has been entered, resets the
/// run state and launches the URL crawl on a worker thread.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    if (this.textBox1.Text.Length == 0)
        return;

    try
    {
        this.InitUrl();
        uThread = new Thread(ExecuteUrl);
        // A background thread cannot keep the process alive after the form is gone.
        uThread.IsBackground = true;
        uThread.Start();
    }
    catch (Exception ex)
    {
        // Start-up failed after InitUrl locked the inputs: report the failure
        // and unlock the UI instead of silently leaving it stuck in the
        // "running" state.
        Debug.WriteLine("Could not start URL extraction: " + ex);
        uThread = null;
        this.TermUrl();
    }
}
/// <summary>
/// Start button of the email tab: when a start URL has been entered, resets
/// the run state and launches two worker threads, one that crawls pages
/// (ExecuteEmail) and one that scans the crawled pages for addresses
/// (ExecuteExtractEmail).
/// </summary>
private void button4_Click(object sender, EventArgs e)
{
    if (this.textBox14.Text.Length == 0)
        return;

    try
    {
        this.InitEmail();

        eThread = new Thread(ExecuteEmail);
        // Background threads cannot keep the process alive after the form is gone.
        eThread.IsBackground = true;
        eThread.Start();

        mThread = new Thread(ExecuteExtractEmail);
        mThread.IsBackground = true;
        mThread.Start();
    }
    catch (Exception ex)
    {
        // Start-up failed after InitEmail locked the inputs: report the
        // failure, stop whatever did start, and unlock the UI instead of
        // silently leaving it stuck in the "running" state.
        Debug.WriteLine("Could not start email extraction: " + ex);
        this.button3_Click(sender, e);
    }
}
/// <summary>
/// Stop button of the URL tab: aborts the crawler thread if it is still
/// running and returns the tab to its idle state.
/// </summary>
// NOTE(review): Thread.Abort is a hard stop; a cooperative cancellation flag
// checked by the crawler would be safer, but needs changes in the crawler too.
private void button2_Click(object sender, EventArgs e)
{
    bool crawlerRunning = uThread != null && uThread.IsAlive;
    if (crawlerRunning)
    {
        uThread.Abort();
        uThread = null;
    }

    TermUrl();
}
/// <summary>
/// Stop button of the email tab: aborts the page-crawler thread and the
/// email-scanner thread if they are still running, then returns the tab to
/// its idle state.
/// </summary>
// NOTE(review): Thread.Abort is a hard stop; a cooperative cancellation flag
// checked by the workers would be safer, but needs changes in the workers too.
private void button3_Click(object sender, EventArgs e)
{
    bool crawlerRunning = eThread != null && eThread.IsAlive;
    if (crawlerRunning)
    {
        eThread.Abort();
        eThread = null;
    }

    bool scannerRunning = mThread != null && mThread.IsAlive;
    if (scannerRunning)
    {
        mThread.Abort();
        mThread = null;
    }

    TermEmail();
}
/// <summary>
/// Entry point of the URL crawler thread: crawls from the URL typed into
/// textBox1 and, once the crawl returns, puts the URL tab back into its idle
/// state.
/// </summary>
private void ExecuteUrl()
{
    try
    {
        this.ExtractUrls(this.textBox1.Text);
        this.TermUrl();
    }
    catch (ThreadAbortException)
    {
        // Expected: button2_Click aborts this thread and resets the UI itself.
    }
    catch (Exception ex)
    {
        // Keep the worker from taking the process down, but leave a trace
        // rather than hiding the failure completely.
        Debug.WriteLine("URL extraction thread failed: " + ex);
    }
}
/// <summary>
/// Entry point of the page-crawler thread of the email tab: crawls from the
/// URL typed into textBox14 and, once the crawl returns, puts the email tab
/// back into its idle state.
/// </summary>
private void ExecuteEmail()
{
    try
    {
        this.ExtractEmails(this.textBox14.Text);
        this.TermEmail();
    }
    catch (ThreadAbortException)
    {
        // Expected: button3_Click aborts this thread and resets the UI itself.
    }
    catch (Exception ex)
    {
        // Keep the worker from taking the process down, but leave a trace
        // rather than hiding the failure completely.
        Debug.WriteLine("Email crawl thread failed: " + ex);
    }
}
/// <summary>
/// Entry point of the email-scanner thread: repeatedly scans the crawled
/// pages for addresses, pausing three seconds between passes, for as long as
/// the mThread field is set.
/// </summary>
private void ExecuteExtractEmail()
{
    try
    {
        while (mThread != null)
        {
            LoopThroughListAndExtractEmail();
            Thread.Sleep(3000);
        }
    }
    catch (ThreadAbortException)
    {
        // Expected: button3_Click aborts this thread to stop the scan.
    }
    catch (Exception ex)
    {
        // Keep the worker from taking the process down, but leave a trace
        // rather than hiding the failure completely.
        Debug.WriteLine("Email scanner thread failed: " + ex);
    }
}
/// <summary>
/// One pass of the email scanner: downloads every crawled page that has not
/// been scanned yet, reports each address not seen before (list box, counter,
/// optional text/XML log) and marks the page as scanned.
/// </summary>
/// <remarks>
/// Runs on the scanner thread. The pending pages are copied into a list first
/// because the page dictionary must not be written while it is being
/// enumerated, and the crawler thread adds to it concurrently. Each page is
/// handled in its own try/catch and is marked as scanned even when the
/// download fails, so one unreachable page can no longer block all the pages
/// behind it.
/// </remarks>
private void LoopThroughListAndExtractEmail()
{
    Dictionary<String, Boolean> pages = this.dEmail;
    if (pages == null)
        return;

    // Snapshot the pages still waiting to be scanned.
    List<String> pending = new List<String>();
    try
    {
        lock (pages)
        {
            foreach (KeyValuePair<String, Boolean> kv in pages)
            {
                if (kv.Value)
                    pending.Add(kv.Key);
            }
        }
    }
    catch (InvalidOperationException)
    {
        // The dictionary changed underneath us; the caller runs another pass shortly.
        return;
    }

    foreach (String pageUrl in pending)
    {
        try
        {
            String html;
            HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create(pageUrl);
            hwr.Timeout = 3000;
            // Dispose response and reader so the HTTP connection is released.
            using (HttpWebResponse wr = (HttpWebResponse)hwr.GetResponse())
            using (StreamReader sr = new StreamReader(wr.GetResponseStream()))
            {
                html = sr.ReadToEnd();
            }

            foreach (String sEmail in Common.ExtractEmails(html))
            {
                if (this.qEmail.ContainsKey(sEmail))
                    continue;

                this.qEmail.Add(sEmail, true);

                String address = sEmail;
                int total = ++this.ieTotal;
                // Controls may only be touched on the UI thread.
                this.Invoke((MethodInvoker)delegate
                {
                    this.richTextBox2.AppendText(address);
                    this.richTextBox2.AppendText("\r\n");
                    this.richTextBox2.Focus();
                    this.richTextBox2.ScrollToCaret();
                    this.textBox10.Text = Convert.ToString(total);
                });

                if (this.checkBox4.Checked)
                    this.AppendToTxtFileEmail(address);
                if (this.checkBox3.Checked)
                    this.AppendToXmlFileEmail(address);
            }
        }
        catch (ThreadAbortException)
        {
            // The stop button aborts this thread; let the abort continue.
            throw;
        }
        catch (Exception ex)
        {
            // Unreachable or unreadable page: note it and move on to the next one.
            Debug.WriteLine("Email scan failed for " + pageUrl + ": " + ex.Message);
        }

        // Scanned (or given up on): do not fetch this page again.
        lock (pages)
        {
            pages[pageUrl] = false;
        }
    }
}
// Random source for the URL crawl's sampling. A single instance is used
// because creating a new time-seeded Random per call can return the same
// value for calls made in quick succession.
private readonly Random urlSampler = new Random();

/// <summary>
/// Records <paramref name="sUrl"/> (list box, counters, optional text/XML
/// log), downloads it, and recursively does the same for every link found on
/// the page.
/// </summary>
/// <param name="sUrl">URL to process; anything not starting with "http" is dropped silently.</param>
/// <remarks>
/// Runs on the URL crawler thread, so every control update is marshalled to
/// the UI thread with Invoke. URLs already recorded are counted as filtered.
/// Once more than 50 URLs have been recorded, each new URL is dropped with
/// probability 1/2 and counted as ignored. Links rejected by CanIgnoreUrl are
/// counted as ignored and skipped; the remaining links on the page are still
/// followed. Any failure while processing a URL is counted as skipped.
/// </remarks>
private void ExtractUrls(String sUrl)
{
    try
    {
        if (!sUrl.StartsWith("http", StringComparison.Ordinal))
            return;

        if (this.dUrl.ContainsKey(sUrl))
        {
            int filtered = ++this.iuFiltered;
            this.Invoke((MethodInvoker)delegate { this.textBox4.Text = Convert.ToString(filtered); });
            return;
        }

        if (this.iuTotal > 50 && this.urlSampler.Next(2) == 1)
        {
            int sampledOut = ++this.iuIgnored;
            this.Invoke((MethodInvoker)delegate { this.textBox3.Text = Convert.ToString(sampledOut); });
            return;
        }

        this.dUrl.Add(sUrl, true);
        int total = ++this.iuTotal;
        this.Invoke((MethodInvoker)delegate
        {
            this.textBox2.Text = sUrl;
            this.richTextBox1.AppendText(sUrl);
            this.richTextBox1.AppendText("\r\n");
            this.richTextBox1.Focus();
            this.richTextBox1.ScrollToCaret();
            this.textBox5.Text = Convert.ToString(total);
        });

        if (this.checkBox1.Checked)
            this.AppendToTxtFileUrl(sUrl);
        if (this.checkBox2.Checked)
            this.AppendToXmlFileUrl(sUrl);

        // Read the page fully and release the connection BEFORE recursing;
        // otherwise every level of the recursion keeps a connection open.
        String html;
        HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create(sUrl);
        hwr.Timeout = 10000;
        using (HttpWebResponse wr = (HttpWebResponse)hwr.GetResponse())
        using (StreamReader sr = new StreamReader(wr.GetResponseStream()))
        {
            html = sr.ReadToEnd();
        }

        foreach (String surl in Common.ExtractURLs(html))
        {
            if (this.CanIgnoreUrl(surl))
            {
                int ignored = ++this.iuIgnored;
                this.Invoke((MethodInvoker)delegate { this.textBox3.Text = Convert.ToString(ignored); });
                // Skip only this link; the rest of the page is still followed.
                continue;
            }

            this.ExtractUrls(surl);
        }
    }
    catch (ThreadAbortException)
    {
        // The stop button aborts this thread; that is not a skipped URL.
        throw;
    }
    catch (Exception)
    {
        int skipped = ++this.iuSkipped;
        this.Invoke((MethodInvoker)delegate { this.textBox6.Text = Convert.ToString(skipped); });
    }
}
// Random source for the email crawl's sampling. A single instance is used
// because creating a new time-seeded Random per call can return the same
// value for calls made in quick succession.
private readonly Random emailSampler = new Random();

/// <summary>
/// Registers <paramref name="sUrl"/> as a page to be scanned for email
/// addresses, downloads it, and recursively does the same for every link
/// found on the page.
/// </summary>
/// <param name="sUrl">URL to process; anything not starting with "http" is dropped silently.</param>
/// <remarks>
/// Runs on the page-crawler thread, so every control update is marshalled to
/// the UI thread with Invoke. Pages already registered are counted as
/// filtered. Once more than 50 addresses have been found, each new page is
/// dropped with probability 1/2 and counted as ignored. Links rejected by
/// CanIgnoreUrl are counted as ignored and skipped; the remaining links on the
/// page are still followed. Any failure while processing a page is counted as
/// skipped.
/// </remarks>
private void ExtractEmails(String sUrl)
{
    try
    {
        if (!sUrl.StartsWith("http", StringComparison.Ordinal))
            return;

        // The scanner thread reads and updates this dictionary as well, so our
        // own accesses are serialised on the dictionary instance. This only
        // protects against code that takes the same lock.
        Dictionary<String, Boolean> pages = this.dEmail;

        bool alreadyKnown;
        lock (pages)
        {
            alreadyKnown = pages.ContainsKey(sUrl);
        }
        if (alreadyKnown)
        {
            int filtered = ++this.ieFiltered;
            this.Invoke((MethodInvoker)delegate { this.textBox11.Text = Convert.ToString(filtered); });
            return;
        }

        if (this.ieTotal > 50 && this.emailSampler.Next(2) == 1)
        {
            int sampledOut = ++this.ieIgnored;
            this.Invoke((MethodInvoker)delegate { this.textBox12.Text = Convert.ToString(sampledOut); });
            return;
        }

        lock (pages)
        {
            pages.Add(sUrl, true);
        }
        this.Invoke((MethodInvoker)delegate { this.textBox13.Text = sUrl; });

        // Read the page fully and release the connection BEFORE recursing;
        // otherwise every level of the recursion keeps a connection open.
        String html;
        HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create(sUrl);
        hwr.Timeout = 10000;
        using (HttpWebResponse wr = (HttpWebResponse)hwr.GetResponse())
        using (StreamReader sr = new StreamReader(wr.GetResponseStream()))
        {
            html = sr.ReadToEnd();
        }

        foreach (String surl in Common.ExtractURLs(html))
        {
            if (this.CanIgnoreUrl(surl))
            {
                int ignored = ++this.ieIgnored;
                this.Invoke((MethodInvoker)delegate { this.textBox12.Text = Convert.ToString(ignored); });
                // Skip only this link; the rest of the page is still followed.
                continue;
            }

            this.ExtractEmails(surl);
        }
    }
    catch (ThreadAbortException)
    {
        // The stop button aborts this thread; that is not a skipped page.
        throw;
    }
    catch (Exception)
    {
        int skipped = ++this.ieSkipped;
        this.Invoke((MethodInvoker)delegate { this.textBox9.Text = Convert.ToString(skipped); });
    }
}
// File-name suffixes of links that are not worth downloading and parsing.
private static readonly string[] IgnoredUrlSuffixes =
{
    ".exe", ".bin", ".tar", ".mp3", ".avi", ".gif",
    ".doc", ".pdf", ".jpg", ".mpeg", ".gz"
};

/// <summary>
/// Tells whether a link should be skipped because it ends in one of the
/// ignored file suffixes (executables, archives, media, documents).
/// </summary>
/// <param name="sUrl">Link to test.</param>
/// <returns>
/// true when <paramref name="sUrl"/> is null or ends with an ignored suffix;
/// otherwise false.
/// </returns>
/// <remarks>
/// The comparison is ordinal and case-insensitive, so "PHOTO.JPG" is treated
/// like "photo.jpg" and the result does not depend on the current culture.
/// </remarks>
private bool CanIgnoreUrl(string sUrl)
{
    // A missing link cannot be crawled.
    if (sUrl == null)
        return true;

    foreach (string suffix in IgnoredUrlSuffixes)
    {
        if (sUrl.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
            return true;
    }

    return false;
}
/// <summary>
/// Closing handler: stops both extractions by reusing the two stop-button
/// handlers, so no worker thread outlives the form.
/// </summary>
private void Form1_FormClosing(object sender, FormClosingEventArgs e)
{
    // Stop the URL extraction first, then the email extraction.
    button2_Click(sender, e);
    button3_Click(sender, e);
}
/// <summary>
/// "Save File" menu item: opens the save dialog. The writing itself happens
/// in saveFileDialog1_FileOk.
/// </summary>
private void saveFileToolStripMenuItem_Click(object sender, EventArgs e)
{
    saveFileDialog1.ShowDialog();
}
/// <summary>
/// "Exit" menu item: asks the application to shut down.
/// </summary>
private void exitToolStripMenuItem_Click(object sender, EventArgs e)
{
    System.Windows.Forms.Application.Exit();
}
/// <summary>
/// Writes the results of the selected tab to the file chosen in the save
/// dialog, one entry per line: the recorded URLs when the first tab is
/// selected, otherwise the reported email addresses.
/// </summary>
/// <remarks>
/// Both result dictionaries are null until the first extraction has been
/// started; in that case an empty file is written instead of failing with a
/// NullReferenceException. The keys are copied before the file is opened so
/// that the file is written from a stable list rather than from a dictionary
/// a worker thread may still be adding to.
/// </remarks>
private void saveFileDialog1_FileOk(object sender, CancelEventArgs e)
{
    Dictionary<String, Boolean> results =
        (this.tabControl1.SelectedIndex == 0) ? this.dUrl : this.qEmail;

    List<String> lines = (results == null)
        ? new List<String>()
        : new List<String>(results.Keys);

    using (TextWriter tw = new StreamWriter(this.saveFileDialog1.FileName))
    {
        foreach (String line in lines)
        {
            tw.WriteLine(line);
        }
    }
}
/// <summary>
/// Appends <paramref name="str"/> as one line to the plain-text URL log.
/// </summary>
/// <param name="str">Text to append (a URL).</param>
// The log file name is built from two Common helpers; presumably the current
// directory and today's date -- confirm against CommonLib.
private void AppendToTxtFileUrl(String str)
{
    String logPath = String.Format(
        "{0}ExtractURL_{1}.txt",
        Common.GetCurrentDirectory(),
        Common.GetNow_YYYYMMDD());

    // Second argument true = append; disposing the writer flushes and closes the file.
    using (StreamWriter log = new StreamWriter(logPath, true))
    {
        log.WriteLine(str);
    }
}
/// <summary>
/// Appends <paramref name="str"/> as one line to the plain-text email log.
/// </summary>
/// <param name="str">Text to append (an email address).</param>
// The log file name is built from two Common helpers; presumably the current
// directory and today's date -- confirm against CommonLib.
private void AppendToTxtFileEmail(String str)
{
    String logPath = String.Format(
        "{0}ExtractEmail_{1}.txt",
        Common.GetCurrentDirectory(),
        Common.GetNow_YYYYMMDD());

    // Second argument true = append; disposing the writer flushes and closes the file.
    using (StreamWriter log = new StreamWriter(logPath, true))
    {
        log.WriteLine(str);
    }
}
/// <summary>
/// Appends a &lt;Url&gt; element containing <paramref name="str"/> to the XML
/// URL log, creating the log with an empty &lt;ExtractedData&gt; root first
/// when it does not exist yet.
/// </summary>
/// <param name="str">Text of the new element (a URL).</param>
/// <remarks>
/// The skeleton writer is wrapped in a using block so the file handle is
/// released even when writing fails. A zero-length file left behind by an
/// earlier failure is rewritten too, because it could never be loaded as XML.
/// </remarks>
// The log file name is built from two Common helpers; presumably the current
// directory and today's date -- confirm against CommonLib.
private void AppendToXmlFileUrl(String str)
{
    String sFile = String.Format("{0}ExtractURL_{1}.xml", Common.GetCurrentDirectory(), Common.GetNow_YYYYMMDD());

    if (!File.Exists(sFile) || new FileInfo(sFile).Length == 0)
    {
        using (XmlTextWriter xw = new XmlTextWriter(sFile, System.Text.Encoding.UTF8))
        {
            xw.Formatting = Formatting.Indented;
            xw.WriteProcessingInstruction("xml", "version='1.0' encoding='UTF-8'");
            xw.WriteComment("Epic Extractor - XML Version 5.0");
            xw.WriteStartElement("ExtractedData");
            xw.WriteFullEndElement();
        }
    }

    XmlDocument xd = new XmlDocument();
    xd.Load(sFile);

    // Build the element completely, then attach it to the root.
    XmlElement xElem = xd.CreateElement("Url");
    xElem.AppendChild(xd.CreateTextNode(str));
    xd.DocumentElement.AppendChild(xElem);

    xd.Save(sFile);
}
/// <summary>
/// Appends an &lt;Email&gt; element containing <paramref name="str"/> to the
/// XML email log, creating the log with an empty &lt;ExtractedData&gt; root
/// first when it does not exist yet.
/// </summary>
/// <param name="str">Text of the new element (an email address).</param>
/// <remarks>
/// The skeleton writer is wrapped in a using block so the file handle is
/// released even when writing fails. A zero-length file left behind by an
/// earlier failure is rewritten too, because it could never be loaded as XML.
/// </remarks>
// The log file name is built from two Common helpers; presumably the current
// directory and today's date -- confirm against CommonLib.
private void AppendToXmlFileEmail(String str)
{
    String sFile = String.Format("{0}ExtractEmail_{1}.xml", Common.GetCurrentDirectory(), Common.GetNow_YYYYMMDD());

    if (!File.Exists(sFile) || new FileInfo(sFile).Length == 0)
    {
        using (XmlTextWriter xw = new XmlTextWriter(sFile, System.Text.Encoding.UTF8))
        {
            xw.Formatting = Formatting.Indented;
            xw.WriteProcessingInstruction("xml", "version='1.0' encoding='UTF-8'");
            xw.WriteComment("Epic Extractor - XML Version 5.0");
            xw.WriteStartElement("ExtractedData");
            xw.WriteFullEndElement();
        }
    }

    XmlDocument xd = new XmlDocument();
    xd.Load(sFile);

    // Build the element completely, then attach it to the root.
    XmlElement xElem = xd.CreateElement("Email");
    xElem.AppendChild(xd.CreateTextNode(str));
    xd.DocumentElement.AppendChild(xElem);

    xd.Save(sFile);
}
/// <summary>
/// "About" menu item: opens a new, non-modal About window.
/// </summary>
private void aboutToolStripMenuItem_Click(object sender, EventArgs e)
{
    new About().Show();
}
/// <summary>
/// Click handler of the first picture: hands the linked web page to the
/// operating system to open.
/// </summary>
private void pictureBox1_Click(object sender, EventArgs e)
{
    ProcessStartInfo linkedPage = new ProcessStartInfo("http://www.yahoo.com");
    Process.Start(linkedPage);
}
/// <summary>
/// Click handler of the second picture: hands the linked web page to the
/// operating system to open.
/// </summary>
private void pictureBox2_Click(object sender, EventArgs e)
{
    ProcessStartInfo linkedPage = new ProcessStartInfo("http://www.yahoo.com");
    Process.Start(linkedPage);
}
}
}